Import python-pcre2_0.6.0+ds.orig.tar.xz
authorBastian Germann <bage@debian.org>
Mon, 9 Feb 2026 11:01:04 +0000 (12:01 +0100)
committerBastian Germann <bage@debian.org>
Mon, 9 Feb 2026 11:01:04 +0000 (12:01 +0100)
[dgit import orig python-pcre2_0.6.0+ds.orig.tar.xz]

22 files changed:
CMakeLists.txt [new file with mode: 0644]
LICENSE [new file with mode: 0644]
Makefile [new file with mode: 0644]
PKG-INFO [new file with mode: 0644]
README.md [new file with mode: 0755]
pyproject.toml [new file with mode: 0755]
requirements/build-requirements.txt [new file with mode: 0644]
requirements/test-requirements.txt [new file with mode: 0644]
setup.cfg [new file with mode: 0644]
setup.py [new file with mode: 0755]
src/pcre2.egg-info/PKG-INFO [new file with mode: 0644]
src/pcre2.egg-info/SOURCES.txt [new file with mode: 0644]
src/pcre2.egg-info/dependency_links.txt [new file with mode: 0644]
src/pcre2.egg-info/top_level.txt [new file with mode: 0644]
src/pcre2/CMakeLists.txt [new file with mode: 0644]
src/pcre2/__init__.py [new file with mode: 0755]
src/pcre2/_cy.pyx [new file with mode: 0644]
src/pcre2/_libpcre2.pxd [new file with mode: 0755]
tests/test_groups.py [new file with mode: 0644]
tests/test_match.py [new file with mode: 0644]
tests/test_pattern.py [new file with mode: 0644]
tests/test_re_compatibility.py [new file with mode: 0644]

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644 (file)
index 0000000..2dddcfb
--- /dev/null
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.7.2)
+
+project(pcre2)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_C_STANDARD 99)
+
+set(PCRE2_INCLUDE_DIR ${CMAKE_BINARY_DIR}/src/libpcre2/interface)
+set(CYTHON_EXTRA_COMPILE_ARGS -DPCRE2_CODE_UNIT_WIDTH=8 -fPIC)
+
+# Set PCRE2 options.
+set(PCRE2_SUPPORT_JIT ON CACHE BOOL "" FORCE)
+set(PCRE2_NEVER_BACKSLASH_C ON CACHE BOOL "" FORCE)
+
+# Always make a release build.
+set(CMAKE_BUILD_TYPE Release)
+
+# Build PCRE2 library as both shared and static.
+set(BUILD_STATIC_LIBS ON)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/libpcre2)
+
+# Build Cython code as shared.
+set(BUILD_STATIC_LIBS OFF)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/pcre2)
+
+# Include PCRE2 header for Cython API.
+install(FILES ${PCRE2_INCLUDE_DIR}/pcre2.h DESTINATION src/pcre2)
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..4a57011
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, grtetrault
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..8755fcc
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,27 @@
+SHELL = /bin/bash
+
+init:
+       git submodule update --init --recursive
+       python3 -m venv ./.venv
+       ./.venv/bin/pip install -r ./requirements/build-requirements.txt
+       ./.venv/bin/pip install -r ./requirements/test-requirements.txt
+       ./.venv/bin/pip install .
+
+build:
+       ./.venv/bin/pip install . --force-reinstall
+
+clean:
+       rm -rf ./dist
+       rm -rf ./build
+       rm -rf ./_skbuild
+       find ./src/pcre2 -type f -name '*.c' -print0 | xargs -0 rm -vf
+       find ./src/pcre2 -type f -name '*.html' -print0 | xargs -0 rm -vf
+       find . -type f -name '*.pyc' | xargs rm -r
+       find . -type d -name '*.egg-info' | xargs rm -r
+       find . -type d -name '*.ipynb_checkpoints' | xargs rm -r
+
+purge:
+       rm -rf ./.venv
+
+benchmark:
+       ./.venv/bin/python ./benchmarks/run_regex_redux.py
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644 (file)
index 0000000..b186c2a
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,150 @@
+Metadata-Version: 2.4
+Name: pcre2
+Version: 0.6.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: summary
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+This library aims to be compatible with Python's built-in `re` module. In many cases, this means
+that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below).
+However, PCRE2 and Python implement different regex specifications, so patterns and behavior will
+not always be translatable (e.g., the syntax for group replacement differs).
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and
+bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and
+can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit
+True
+>>> patn.groupindex
+{'head': 1, 'tail': 2}
+>>> patn.flags
+<CompileOption.IGNORECASE: 8>
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match[0]
+'foo bar'
+>>> match.span()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.sub(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.sub(repl, subj, count=1)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.finditer(subj):
+...     print(match.group('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script              | Number of runs | Total time | Real time  | User time   | System time   |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| baseline.py         |             10 |      3.230 |      0.323 |       0.020 |         0.100 |
+| re_vanilla.py       |             10 |     51.090 |      5.109 |      11.375 |         0.530 |
+| pcre2_vanilla.py    |             10 |     21.980 |      2.198 |       3.154 |         0.483 |
+| pcre2_optimized.py  |             10 |     14.860 |      1.486 |       2.520 |         0.548 |
+| cffi_optimized.py   |             10 |     14.130 |      1.413 |       3.111 |         0.411 |
+Script descriptions are as follows,
+
+| Script              | Description                                                          |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py`       | Reads input file and outputs stored expected output                  |
+| `re_vanilla.py`     | Pure Python version                                                  |
+| `pcre2_vanilla.py`  | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re`         |
+| `pcre2_optimized.py` | More optimized implementation using `pcre2`                         |
+| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
diff --git a/README.md b/README.md
new file mode 100755 (executable)
index 0000000..9cb6b16
--- /dev/null
+++ b/README.md
@@ -0,0 +1,116 @@
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+This library aims to be compatible with Python's built-in `re` module. In many cases, this means
+that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below).
+However, PCRE2 and Python implement different regex specifications, so patterns and behavior will
+not always be translatable (e.g., the syntax for group replacement differs).
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and
+bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and
+can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit
+True
+>>> patn.groupindex
+{'head': 1, 'tail': 2}
+>>> patn.flags
+<CompileOption.IGNORECASE: 8>
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match[0]
+'foo bar'
+>>> match.span()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.sub(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.sub(repl, subj, count=1)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.finditer(subj):
+...     print(match.group('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script              | Number of runs | Total time | Real time  | User time   | System time   |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| baseline.py         |             10 |      3.230 |      0.323 |       0.020 |         0.100 |
+| re_vanilla.py       |             10 |     51.090 |      5.109 |      11.375 |         0.530 |
+| pcre2_vanilla.py    |             10 |     21.980 |      2.198 |       3.154 |         0.483 |
+| pcre2_optimized.py  |             10 |     14.860 |      1.486 |       2.520 |         0.548 |
+| cffi_optimized.py   |             10 |     14.130 |      1.413 |       3.111 |         0.411 |
+Script descriptions are as follows,
+
+| Script              | Description                                                          |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py`       | Reads input file and outputs stored expected output                  |
+| `re_vanilla.py`     | Pure Python version                                                  |
+| `pcre2_vanilla.py`  | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re`         |
+| `pcre2_optimized.py` | More optimized implementation using `pcre2`                         |
+| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100755 (executable)
index 0000000..c0f420a
--- /dev/null
@@ -0,0 +1,8 @@
+[build-system]
+requires = [
+  "setuptools>=42",
+  "scikit-build",
+  "Cython",
+  "cmake"
+]
+build-backend = "setuptools.build_meta"
diff --git a/requirements/build-requirements.txt b/requirements/build-requirements.txt
new file mode 100644 (file)
index 0000000..067a22d
--- /dev/null
@@ -0,0 +1,6 @@
+requests
+build
+wheel
+scikit-build
+cmake
+Cython
\ No newline at end of file
diff --git a/requirements/test-requirements.txt b/requirements/test-requirements.txt
new file mode 100644 (file)
index 0000000..209b771
--- /dev/null
@@ -0,0 +1,3 @@
+twine
+pytest
+gitpython
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644 (file)
index 0000000..8bfd5a1
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+[egg_info]
+tag_build = 
+tag_date = 0
+
diff --git a/setup.py b/setup.py
new file mode 100755 (executable)
index 0000000..ddd7ba2
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,48 @@
+# -*- coding:utf-8 -*-
+
+import os
+import skbuild
+import setuptools
+
+
+def get_long_desciption():
+    cwd = os.path.abspath(os.path.dirname(__file__))
+    filename = os.path.join(cwd, "README.md")
+    with open(filename) as f:
+        long_description = f.read()
+
+    return long_description
+
+
+skbuild.setup(
+    name="pcre2",
+    version="0.6.0",
+    description="Python bindings for the PCRE2 regular expression library",
+    long_description=get_long_desciption(),
+    long_description_content_type="text/markdown",
+    license="BSD 3-Clause License",
+    author="Garrett Tetrault",
+    url="https://github.com/grtetrault/pcre2.py",
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: C",
+        "Programming Language :: Cython",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
+        "Programming Language :: Python :: 3.14",
+        "Operating System :: MacOS :: MacOS X",
+        "Operating System :: POSIX :: Linux",
+        "Operating System :: Microsoft :: Windows",
+    ],
+    include_package_data=True,
+    packages=setuptools.find_packages("src"),
+    package_dir={"": "src"},
+    cmake_languages="C",
+)
diff --git a/src/pcre2.egg-info/PKG-INFO b/src/pcre2.egg-info/PKG-INFO
new file mode 100644 (file)
index 0000000..b186c2a
--- /dev/null
@@ -0,0 +1,150 @@
+Metadata-Version: 2.4
+Name: pcre2
+Version: 0.6.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: summary
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+This library aims to be compatible with Python's built-in `re` module. In many cases, this means
+that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below).
+However, PCRE2 and Python implement different regex specifications, so patterns and behavior will
+not always be translatable (e.g., the syntax for group replacement differs).
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and
+bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and
+can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit
+True
+>>> patn.groupindex
+{'head': 1, 'tail': 2}
+>>> patn.flags
+<CompileOption.IGNORECASE: 8>
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match[0]
+'foo bar'
+>>> match.span()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.sub(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.sub(repl, subj, count=1)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.finditer(subj):
+...     print(match.group('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script              | Number of runs | Total time | Real time  | User time   | System time   |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| baseline.py         |             10 |      3.230 |      0.323 |       0.020 |         0.100 |
+| re_vanilla.py       |             10 |     51.090 |      5.109 |      11.375 |         0.530 |
+| pcre2_vanilla.py    |             10 |     21.980 |      2.198 |       3.154 |         0.483 |
+| pcre2_optimized.py  |             10 |     14.860 |      1.486 |       2.520 |         0.548 |
+| cffi_optimized.py   |             10 |     14.130 |      1.413 |       3.111 |         0.411 |
+Script descriptions are as follows,
+
+| Script              | Description                                                          |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py`       | Reads input file and outputs stored expected output                  |
+| `re_vanilla.py`     | Pure Python version                                                  |
+| `pcre2_vanilla.py`  | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re`         |
+| `pcre2_optimized.py` | More optimized implementation using `pcre2`                         |
+| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
diff --git a/src/pcre2.egg-info/SOURCES.txt b/src/pcre2.egg-info/SOURCES.txt
new file mode 100644 (file)
index 0000000..e06b222
--- /dev/null
@@ -0,0 +1,642 @@
+CMakeLists.txt
+LICENSE
+Makefile
+README.md
+pyproject.toml
+setup.py
+requirements/build-requirements.txt
+requirements/test-requirements.txt
+src/libpcre2/.editorconfig
+src/libpcre2/.git
+src/libpcre2/.gitattributes
+src/libpcre2/.gitignore
+src/libpcre2/.gitmodules
+src/libpcre2/AUTHORS.md
+src/libpcre2/BUILD.bazel
+src/libpcre2/CMakeLists.txt
+src/libpcre2/COPYING
+src/libpcre2/ChangeLog
+src/libpcre2/HACKING
+src/libpcre2/INSTALL
+src/libpcre2/LICENCE.md
+src/libpcre2/MODULE.bazel
+src/libpcre2/Makefile.am
+src/libpcre2/Makefile.in
+src/libpcre2/NEWS
+src/libpcre2/NON-AUTOTOOLS-BUILD
+src/libpcre2/README
+src/libpcre2/README.md
+src/libpcre2/RunGrepTest
+src/libpcre2/RunGrepTest.bat
+src/libpcre2/RunTest
+src/libpcre2/RunTest.bat
+src/libpcre2/SECURITY.md
+src/libpcre2/aclocal.m4
+src/libpcre2/ar-lib
+src/libpcre2/autogen.sh
+src/libpcre2/build.zig
+src/libpcre2/compile
+src/libpcre2/config.guess
+src/libpcre2/config.sub
+src/libpcre2/configure
+src/libpcre2/configure.ac
+src/libpcre2/depcomp
+src/libpcre2/install-sh
+src/libpcre2/libpcre2-16.pc.in
+src/libpcre2/libpcre2-32.pc.in
+src/libpcre2/libpcre2-8.pc.in
+src/libpcre2/libpcre2-posix.pc.in
+src/libpcre2/ltmain.sh
+src/libpcre2/missing
+src/libpcre2/pcre2-config.in
+src/libpcre2/perltest.sh
+src/libpcre2/test-driver
+src/libpcre2/.github/codecov.yml
+src/libpcre2/.github/dependabot.yml
+src/libpcre2/.github/scripts/merge_sarif.py
+src/libpcre2/.github/workflows/build.yml
+src/libpcre2/.github/workflows/cifuzz.yml
+src/libpcre2/.github/workflows/clang-analyzer.yml
+src/libpcre2/.github/workflows/codeql.yml
+src/libpcre2/.github/workflows/dev.yml
+src/libpcre2/.github/workflows/pages.yml
+src/libpcre2/.github/workflows/scorecards.yml
+src/libpcre2/.github/workflows/sync.yml
+src/libpcre2/cmake/COPYING-CMAKE-SCRIPTS
+src/libpcre2/cmake/FindEditline.cmake
+src/libpcre2/cmake/FindReadline.cmake
+src/libpcre2/cmake/PCRE2CheckVscript.cmake
+src/libpcre2/cmake/PCRE2UseSystemExtensions.cmake
+src/libpcre2/cmake/PCRE2WarningAsError.cmake
+src/libpcre2/cmake/pcre2-config.cmake.in
+src/libpcre2/deps/sljit/.git
+src/libpcre2/deps/sljit/.gitignore
+src/libpcre2/deps/sljit/API_CHANGES
+src/libpcre2/deps/sljit/CMakeLists.txt
+src/libpcre2/deps/sljit/GNUmakefile
+src/libpcre2/deps/sljit/INTERNAL_CHANGES
+src/libpcre2/deps/sljit/LICENSE
+src/libpcre2/deps/sljit/README.md
+src/libpcre2/deps/sljit/.github/workflows/actions.yml
+src/libpcre2/deps/sljit/docs/README.md
+src/libpcre2/deps/sljit/docs/general/architecture.md
+src/libpcre2/deps/sljit/docs/general/contributing.md
+src/libpcre2/deps/sljit/docs/general/introduction.md
+src/libpcre2/deps/sljit/docs/general/getting-started/_category_.json
+src/libpcre2/deps/sljit/docs/general/getting-started/configuration.md
+src/libpcre2/deps/sljit/docs/general/getting-started/setup.md
+src/libpcre2/deps/sljit/docs/general/use-cases/_category_.json
+src/libpcre2/deps/sljit/docs/general/use-cases/bytecode-interpreters.md
+src/libpcre2/deps/sljit/docs/general/use-cases/overview.md
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/_category_.json
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/figure1.svg
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/performance-comparison.md
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/regular-expression-engine-types.md
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/speeding-up-pcre2-with-sljit.md
+src/libpcre2/deps/sljit/docs/tutorial/01-overview.md
+src/libpcre2/deps/sljit/docs/tutorial/02-your-first-program.md
+src/libpcre2/deps/sljit/docs/tutorial/03-branching.md
+src/libpcre2/deps/sljit/docs/tutorial/04-calling-external-functions.md
+src/libpcre2/deps/sljit/docs/tutorial/05-accessing-structures.md
+src/libpcre2/deps/sljit/docs/tutorial/06-accessing-arrays.md
+src/libpcre2/deps/sljit/docs/tutorial/07-local-variables.md
+src/libpcre2/deps/sljit/docs/tutorial/08-where-to-go-from-here.md
+src/libpcre2/deps/sljit/docs/tutorial/sources/99bottles.bf
+src/libpcre2/deps/sljit/docs/tutorial/sources/array_access.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/brainfuck.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/branch.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/first_program.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/func_call.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/hello.bf
+src/libpcre2/deps/sljit/docs/tutorial/sources/loop.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/struct_access.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/temp_var.c
+src/libpcre2/deps/sljit/docs/website/.gitignore
+src/libpcre2/deps/sljit/docs/website/README.md
+src/libpcre2/deps/sljit/docs/website/docusaurus.config.js
+src/libpcre2/deps/sljit/docs/website/package-lock.json
+src/libpcre2/deps/sljit/docs/website/package.json
+src/libpcre2/deps/sljit/docs/website/sidebars.js
+src/libpcre2/deps/sljit/docs/website/src/components/HomepageFeatures/index.js
+src/libpcre2/deps/sljit/docs/website/src/components/HomepageFeatures/styles.module.css
+src/libpcre2/deps/sljit/docs/website/src/css/custom.css
+src/libpcre2/deps/sljit/docs/website/src/pages/index.js
+src/libpcre2/deps/sljit/docs/website/src/pages/index.module.css
+src/libpcre2/deps/sljit/docs/website/static/.nojekyll
+src/libpcre2/deps/sljit/docs/website/static/assets/regex-test.tgz
+src/libpcre2/deps/sljit/regex_src/regexJIT.c
+src/libpcre2/deps/sljit/regex_src/regexJIT.h
+src/libpcre2/deps/sljit/regex_src/regexMain.c
+src/libpcre2/deps/sljit/sljit_src/sljitConfig.h
+src/libpcre2/deps/sljit/sljit_src/sljitConfigCPU.h
+src/libpcre2/deps/sljit/sljit_src/sljitConfigInternal.h
+src/libpcre2/deps/sljit/sljit_src/sljitLir.c
+src/libpcre2/deps/sljit/sljit_src/sljitLir.h
+src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_T2_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeLOONGARCH_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeS390X.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitSerialize.c
+src/libpcre2/deps/sljit/sljit_src/sljitUtils.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorApple.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorCore.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorFreeBSD.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorPosix.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorWindows.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorNetBSD.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorPosix.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorPosix.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorWindows.c
+src/libpcre2/deps/sljit/test_src/sljitConfigPost.h
+src/libpcre2/deps/sljit/test_src/sljitConfigPre.h
+src/libpcre2/deps/sljit/test_src/sljitMain.c
+src/libpcre2/deps/sljit/test_src/sljitTest.c
+src/libpcre2/deps/sljit/test_src/sljitTestBuffers.h
+src/libpcre2/deps/sljit/test_src/sljitTestCall.h
+src/libpcre2/deps/sljit/test_src/sljitTestFloat.h
+src/libpcre2/deps/sljit/test_src/sljitTestSerialize.h
+src/libpcre2/deps/sljit/test_src/sljitTestSimd.h
+src/libpcre2/doc/index.html.src
+src/libpcre2/doc/pcre2-config.1
+src/libpcre2/doc/pcre2-config.txt
+src/libpcre2/doc/pcre2.3
+src/libpcre2/doc/pcre2.txt
+src/libpcre2/doc/pcre2_callout_enumerate.3
+src/libpcre2/doc/pcre2_code_copy.3
+src/libpcre2/doc/pcre2_code_copy_with_tables.3
+src/libpcre2/doc/pcre2_code_free.3
+src/libpcre2/doc/pcre2_compile.3
+src/libpcre2/doc/pcre2_compile_context_copy.3
+src/libpcre2/doc/pcre2_compile_context_create.3
+src/libpcre2/doc/pcre2_compile_context_free.3
+src/libpcre2/doc/pcre2_config.3
+src/libpcre2/doc/pcre2_convert_context_copy.3
+src/libpcre2/doc/pcre2_convert_context_create.3
+src/libpcre2/doc/pcre2_convert_context_free.3
+src/libpcre2/doc/pcre2_converted_pattern_free.3
+src/libpcre2/doc/pcre2_dfa_match.3
+src/libpcre2/doc/pcre2_general_context_copy.3
+src/libpcre2/doc/pcre2_general_context_create.3
+src/libpcre2/doc/pcre2_general_context_free.3
+src/libpcre2/doc/pcre2_get_error_message.3
+src/libpcre2/doc/pcre2_get_mark.3
+src/libpcre2/doc/pcre2_get_match_data_heapframes_size.3
+src/libpcre2/doc/pcre2_get_match_data_size.3
+src/libpcre2/doc/pcre2_get_ovector_count.3
+src/libpcre2/doc/pcre2_get_ovector_pointer.3
+src/libpcre2/doc/pcre2_get_startchar.3
+src/libpcre2/doc/pcre2_jit_compile.3
+src/libpcre2/doc/pcre2_jit_free_unused_memory.3
+src/libpcre2/doc/pcre2_jit_match.3
+src/libpcre2/doc/pcre2_jit_stack_assign.3
+src/libpcre2/doc/pcre2_jit_stack_create.3
+src/libpcre2/doc/pcre2_jit_stack_free.3
+src/libpcre2/doc/pcre2_maketables.3
+src/libpcre2/doc/pcre2_maketables_free.3
+src/libpcre2/doc/pcre2_match.3
+src/libpcre2/doc/pcre2_match_context_copy.3
+src/libpcre2/doc/pcre2_match_context_create.3
+src/libpcre2/doc/pcre2_match_context_free.3
+src/libpcre2/doc/pcre2_match_data_create.3
+src/libpcre2/doc/pcre2_match_data_create_from_pattern.3
+src/libpcre2/doc/pcre2_match_data_free.3
+src/libpcre2/doc/pcre2_next_match.3
+src/libpcre2/doc/pcre2_pattern_convert.3
+src/libpcre2/doc/pcre2_pattern_info.3
+src/libpcre2/doc/pcre2_serialize_decode.3
+src/libpcre2/doc/pcre2_serialize_encode.3
+src/libpcre2/doc/pcre2_serialize_free.3
+src/libpcre2/doc/pcre2_serialize_get_number_of_codes.3
+src/libpcre2/doc/pcre2_set_bsr.3
+src/libpcre2/doc/pcre2_set_callout.3
+src/libpcre2/doc/pcre2_set_character_tables.3
+src/libpcre2/doc/pcre2_set_compile_extra_options.3
+src/libpcre2/doc/pcre2_set_compile_recursion_guard.3
+src/libpcre2/doc/pcre2_set_depth_limit.3
+src/libpcre2/doc/pcre2_set_glob_escape.3
+src/libpcre2/doc/pcre2_set_glob_separator.3
+src/libpcre2/doc/pcre2_set_heap_limit.3
+src/libpcre2/doc/pcre2_set_match_limit.3
+src/libpcre2/doc/pcre2_set_max_pattern_compiled_length.3
+src/libpcre2/doc/pcre2_set_max_pattern_length.3
+src/libpcre2/doc/pcre2_set_max_varlookbehind.3
+src/libpcre2/doc/pcre2_set_newline.3
+src/libpcre2/doc/pcre2_set_offset_limit.3
+src/libpcre2/doc/pcre2_set_optimize.3
+src/libpcre2/doc/pcre2_set_parens_nest_limit.3
+src/libpcre2/doc/pcre2_set_recursion_limit.3
+src/libpcre2/doc/pcre2_set_recursion_memory_management.3
+src/libpcre2/doc/pcre2_set_substitute_callout.3
+src/libpcre2/doc/pcre2_set_substitute_case_callout.3
+src/libpcre2/doc/pcre2_substitute.3
+src/libpcre2/doc/pcre2_substring_copy_byname.3
+src/libpcre2/doc/pcre2_substring_copy_bynumber.3
+src/libpcre2/doc/pcre2_substring_free.3
+src/libpcre2/doc/pcre2_substring_get_byname.3
+src/libpcre2/doc/pcre2_substring_get_bynumber.3
+src/libpcre2/doc/pcre2_substring_length_byname.3
+src/libpcre2/doc/pcre2_substring_length_bynumber.3
+src/libpcre2/doc/pcre2_substring_list_free.3
+src/libpcre2/doc/pcre2_substring_list_get.3
+src/libpcre2/doc/pcre2_substring_nametable_scan.3
+src/libpcre2/doc/pcre2_substring_number_from_name.3
+src/libpcre2/doc/pcre2api.3
+src/libpcre2/doc/pcre2build.3
+src/libpcre2/doc/pcre2callout.3
+src/libpcre2/doc/pcre2compat.3
+src/libpcre2/doc/pcre2convert.3
+src/libpcre2/doc/pcre2demo.3
+src/libpcre2/doc/pcre2grep.1
+src/libpcre2/doc/pcre2grep.txt
+src/libpcre2/doc/pcre2jit.3
+src/libpcre2/doc/pcre2limits.3
+src/libpcre2/doc/pcre2matching.3
+src/libpcre2/doc/pcre2partial.3
+src/libpcre2/doc/pcre2pattern.3
+src/libpcre2/doc/pcre2perform.3
+src/libpcre2/doc/pcre2posix.3
+src/libpcre2/doc/pcre2sample.3
+src/libpcre2/doc/pcre2serialize.3
+src/libpcre2/doc/pcre2syntax.3
+src/libpcre2/doc/pcre2test.1
+src/libpcre2/doc/pcre2test.txt
+src/libpcre2/doc/pcre2unicode.3
+src/libpcre2/doc/html/NON-AUTOTOOLS-BUILD.txt
+src/libpcre2/doc/html/README.txt
+src/libpcre2/doc/html/index.html
+src/libpcre2/doc/html/pcre2-config.html
+src/libpcre2/doc/html/pcre2.html
+src/libpcre2/doc/html/pcre2_callout_enumerate.html
+src/libpcre2/doc/html/pcre2_code_copy.html
+src/libpcre2/doc/html/pcre2_code_copy_with_tables.html
+src/libpcre2/doc/html/pcre2_code_free.html
+src/libpcre2/doc/html/pcre2_compile.html
+src/libpcre2/doc/html/pcre2_compile_context_copy.html
+src/libpcre2/doc/html/pcre2_compile_context_create.html
+src/libpcre2/doc/html/pcre2_compile_context_free.html
+src/libpcre2/doc/html/pcre2_config.html
+src/libpcre2/doc/html/pcre2_convert_context_copy.html
+src/libpcre2/doc/html/pcre2_convert_context_create.html
+src/libpcre2/doc/html/pcre2_convert_context_free.html
+src/libpcre2/doc/html/pcre2_converted_pattern_free.html
+src/libpcre2/doc/html/pcre2_dfa_match.html
+src/libpcre2/doc/html/pcre2_general_context_copy.html
+src/libpcre2/doc/html/pcre2_general_context_create.html
+src/libpcre2/doc/html/pcre2_general_context_free.html
+src/libpcre2/doc/html/pcre2_get_error_message.html
+src/libpcre2/doc/html/pcre2_get_mark.html
+src/libpcre2/doc/html/pcre2_get_match_data_heapframes_size.html
+src/libpcre2/doc/html/pcre2_get_match_data_size.html
+src/libpcre2/doc/html/pcre2_get_ovector_count.html
+src/libpcre2/doc/html/pcre2_get_ovector_pointer.html
+src/libpcre2/doc/html/pcre2_get_startchar.html
+src/libpcre2/doc/html/pcre2_jit_compile.html
+src/libpcre2/doc/html/pcre2_jit_free_unused_memory.html
+src/libpcre2/doc/html/pcre2_jit_match.html
+src/libpcre2/doc/html/pcre2_jit_stack_assign.html
+src/libpcre2/doc/html/pcre2_jit_stack_create.html
+src/libpcre2/doc/html/pcre2_jit_stack_free.html
+src/libpcre2/doc/html/pcre2_maketables.html
+src/libpcre2/doc/html/pcre2_maketables_free.html
+src/libpcre2/doc/html/pcre2_match.html
+src/libpcre2/doc/html/pcre2_match_context_copy.html
+src/libpcre2/doc/html/pcre2_match_context_create.html
+src/libpcre2/doc/html/pcre2_match_context_free.html
+src/libpcre2/doc/html/pcre2_match_data_create.html
+src/libpcre2/doc/html/pcre2_match_data_create_from_pattern.html
+src/libpcre2/doc/html/pcre2_match_data_free.html
+src/libpcre2/doc/html/pcre2_next_match.html
+src/libpcre2/doc/html/pcre2_pattern_convert.html
+src/libpcre2/doc/html/pcre2_pattern_info.html
+src/libpcre2/doc/html/pcre2_serialize_decode.html
+src/libpcre2/doc/html/pcre2_serialize_encode.html
+src/libpcre2/doc/html/pcre2_serialize_free.html
+src/libpcre2/doc/html/pcre2_serialize_get_number_of_codes.html
+src/libpcre2/doc/html/pcre2_set_bsr.html
+src/libpcre2/doc/html/pcre2_set_callout.html
+src/libpcre2/doc/html/pcre2_set_character_tables.html
+src/libpcre2/doc/html/pcre2_set_compile_extra_options.html
+src/libpcre2/doc/html/pcre2_set_compile_recursion_guard.html
+src/libpcre2/doc/html/pcre2_set_depth_limit.html
+src/libpcre2/doc/html/pcre2_set_glob_escape.html
+src/libpcre2/doc/html/pcre2_set_glob_separator.html
+src/libpcre2/doc/html/pcre2_set_heap_limit.html
+src/libpcre2/doc/html/pcre2_set_match_limit.html
+src/libpcre2/doc/html/pcre2_set_max_pattern_compiled_length.html
+src/libpcre2/doc/html/pcre2_set_max_pattern_length.html
+src/libpcre2/doc/html/pcre2_set_max_varlookbehind.html
+src/libpcre2/doc/html/pcre2_set_newline.html
+src/libpcre2/doc/html/pcre2_set_offset_limit.html
+src/libpcre2/doc/html/pcre2_set_optimize.html
+src/libpcre2/doc/html/pcre2_set_parens_nest_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_memory_management.html
+src/libpcre2/doc/html/pcre2_set_substitute_callout.html
+src/libpcre2/doc/html/pcre2_set_substitute_case_callout.html
+src/libpcre2/doc/html/pcre2_substitute.html
+src/libpcre2/doc/html/pcre2_substring_copy_byname.html
+src/libpcre2/doc/html/pcre2_substring_copy_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_free.html
+src/libpcre2/doc/html/pcre2_substring_get_byname.html
+src/libpcre2/doc/html/pcre2_substring_get_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_length_byname.html
+src/libpcre2/doc/html/pcre2_substring_length_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_list_free.html
+src/libpcre2/doc/html/pcre2_substring_list_get.html
+src/libpcre2/doc/html/pcre2_substring_nametable_scan.html
+src/libpcre2/doc/html/pcre2_substring_number_from_name.html
+src/libpcre2/doc/html/pcre2api.html
+src/libpcre2/doc/html/pcre2build.html
+src/libpcre2/doc/html/pcre2callout.html
+src/libpcre2/doc/html/pcre2compat.html
+src/libpcre2/doc/html/pcre2convert.html
+src/libpcre2/doc/html/pcre2demo.html
+src/libpcre2/doc/html/pcre2grep.html
+src/libpcre2/doc/html/pcre2jit.html
+src/libpcre2/doc/html/pcre2limits.html
+src/libpcre2/doc/html/pcre2matching.html
+src/libpcre2/doc/html/pcre2partial.html
+src/libpcre2/doc/html/pcre2pattern.html
+src/libpcre2/doc/html/pcre2perform.html
+src/libpcre2/doc/html/pcre2posix.html
+src/libpcre2/doc/html/pcre2sample.html
+src/libpcre2/doc/html/pcre2serialize.html
+src/libpcre2/doc/html/pcre2syntax.html
+src/libpcre2/doc/html/pcre2test.html
+src/libpcre2/doc/html/pcre2unicode.html
+src/libpcre2/m4/ax_check_vscript.m4
+src/libpcre2/m4/ax_pthread.m4
+src/libpcre2/m4/libtool.m4
+src/libpcre2/m4/ltoptions.m4
+src/libpcre2/m4/ltsugar.m4
+src/libpcre2/m4/ltversion.m4
+src/libpcre2/m4/lt~obsolete.m4
+src/libpcre2/m4/pcre2_visibility.m4
+src/libpcre2/m4/pcre2_zos.m4
+src/libpcre2/maint/.gitignore
+src/libpcre2/maint/132html
+src/libpcre2/maint/CheckMan
+src/libpcre2/maint/CheckTxt
+src/libpcre2/maint/CleanTxt
+src/libpcre2/maint/Detrail
+src/libpcre2/maint/FilterCoverage.py
+src/libpcre2/maint/GenerateCommon.py
+src/libpcre2/maint/GenerateTest.py
+src/libpcre2/maint/GenerateUcd.py
+src/libpcre2/maint/GenerateUcpHeader.py
+src/libpcre2/maint/GenerateUcpTables.py
+src/libpcre2/maint/LintMan
+src/libpcre2/maint/ManyConfigTests
+src/libpcre2/maint/README
+src/libpcre2/maint/RunCoverage
+src/libpcre2/maint/RunManifestTest
+src/libpcre2/maint/RunManifestTest.ps1
+src/libpcre2/maint/RunPerlTest
+src/libpcre2/maint/RunSymbolTest
+src/libpcre2/maint/RunSymbolTest.ps1
+src/libpcre2/maint/UpdateAlways
+src/libpcre2/maint/UpdateCommon.py
+src/libpcre2/maint/UpdateDates.py
+src/libpcre2/maint/UpdateRelease.py
+src/libpcre2/maint/manifest-cmakeinstall-freebsd
+src/libpcre2/maint/manifest-cmakeinstall-linux
+src/libpcre2/maint/manifest-cmakeinstall-macos
+src/libpcre2/maint/manifest-cmakeinstall-solaris
+src/libpcre2/maint/manifest-cmakeinstall-windows
+src/libpcre2/maint/manifest-libpcre2-16.so
+src/libpcre2/maint/manifest-libpcre2-32.so
+src/libpcre2/maint/manifest-libpcre2-8.so
+src/libpcre2/maint/manifest-libpcre2-posix.so
+src/libpcre2/maint/manifest-makeinstall-freebsd
+src/libpcre2/maint/manifest-makeinstall-linux
+src/libpcre2/maint/manifest-makeinstall-solaris
+src/libpcre2/maint/manifest-tarball
+src/libpcre2/maint/pcre2_chartables.c.non-standard
+src/libpcre2/maint/ucptest.c
+src/libpcre2/maint/utf8.c
+src/libpcre2/maint/Unicode.tables/BidiMirroring.txt
+src/libpcre2/maint/Unicode.tables/CaseFolding.txt
+src/libpcre2/maint/Unicode.tables/DerivedBidiClass.txt
+src/libpcre2/maint/Unicode.tables/DerivedCoreProperties.txt
+src/libpcre2/maint/Unicode.tables/DerivedGeneralCategory.txt
+src/libpcre2/maint/Unicode.tables/GraphemeBreakProperty.txt
+src/libpcre2/maint/Unicode.tables/PropList.txt
+src/libpcre2/maint/Unicode.tables/PropertyAliases.txt
+src/libpcre2/maint/Unicode.tables/PropertyValueAliases.txt
+src/libpcre2/maint/Unicode.tables/ScriptExtensions.txt
+src/libpcre2/maint/Unicode.tables/Scripts.txt
+src/libpcre2/maint/Unicode.tables/UnicodeData.txt
+src/libpcre2/maint/Unicode.tables/emoji-data.txt
+src/libpcre2/maint/cmake-tests/build-interface/CMakeLists.txt
+src/libpcre2/maint/cmake-tests/build-interface/main.c
+src/libpcre2/maint/cmake-tests/install-interface/CMakeLists.txt
+src/libpcre2/maint/cmake-tests/install-interface/main.c
+src/libpcre2/maint/ucptestdata/testinput1
+src/libpcre2/maint/ucptestdata/testinput2
+src/libpcre2/maint/ucptestdata/testoutput1
+src/libpcre2/maint/ucptestdata/testoutput2
+src/libpcre2/src/config-cmake.h.in
+src/libpcre2/src/config.h.generic
+src/libpcre2/src/config.h.in
+src/libpcre2/src/libpcre2-16.sym
+src/libpcre2/src/libpcre2-32.sym
+src/libpcre2/src/libpcre2-8.sym
+src/libpcre2/src/libpcre2-posix.sym
+src/libpcre2/src/pcre2.h.generic
+src/libpcre2/src/pcre2.h.in
+src/libpcre2/src/pcre2_auto_possess.c
+src/libpcre2/src/pcre2_chartables.c.dist
+src/libpcre2/src/pcre2_chartables.c.ebcdic-1047-nl15
+src/libpcre2/src/pcre2_chartables.c.ebcdic-1047-nl25
+src/libpcre2/src/pcre2_chkdint.c
+src/libpcre2/src/pcre2_compile.c
+src/libpcre2/src/pcre2_compile.h
+src/libpcre2/src/pcre2_compile_cgroup.c
+src/libpcre2/src/pcre2_compile_class.c
+src/libpcre2/src/pcre2_config.c
+src/libpcre2/src/pcre2_context.c
+src/libpcre2/src/pcre2_convert.c
+src/libpcre2/src/pcre2_dfa_match.c
+src/libpcre2/src/pcre2_dftables.c
+src/libpcre2/src/pcre2_error.c
+src/libpcre2/src/pcre2_extuni.c
+src/libpcre2/src/pcre2_find_bracket.c
+src/libpcre2/src/pcre2_fuzzsupport.c
+src/libpcre2/src/pcre2_internal.h
+src/libpcre2/src/pcre2_intmodedep.h
+src/libpcre2/src/pcre2_jit_char_inc.h
+src/libpcre2/src/pcre2_jit_compile.c
+src/libpcre2/src/pcre2_jit_match_inc.h
+src/libpcre2/src/pcre2_jit_misc_inc.h
+src/libpcre2/src/pcre2_jit_simd_inc.h
+src/libpcre2/src/pcre2_jit_test.c
+src/libpcre2/src/pcre2_maketables.c
+src/libpcre2/src/pcre2_match.c
+src/libpcre2/src/pcre2_match_data.c
+src/libpcre2/src/pcre2_match_next.c
+src/libpcre2/src/pcre2_newline.c
+src/libpcre2/src/pcre2_ord2utf.c
+src/libpcre2/src/pcre2_pattern_info.c
+src/libpcre2/src/pcre2_printint_inc.h
+src/libpcre2/src/pcre2_script_run.c
+src/libpcre2/src/pcre2_serialize.c
+src/libpcre2/src/pcre2_string_utils.c
+src/libpcre2/src/pcre2_study.c
+src/libpcre2/src/pcre2_substitute.c
+src/libpcre2/src/pcre2_substring.c
+src/libpcre2/src/pcre2_tables.c
+src/libpcre2/src/pcre2_ucd.c
+src/libpcre2/src/pcre2_ucp.h
+src/libpcre2/src/pcre2_ucptables_inc.h
+src/libpcre2/src/pcre2_util.h
+src/libpcre2/src/pcre2_valid_utf.c
+src/libpcre2/src/pcre2_xclass.c
+src/libpcre2/src/pcre2demo.c
+src/libpcre2/src/pcre2grep.c
+src/libpcre2/src/pcre2posix.c
+src/libpcre2/src/pcre2posix.h
+src/libpcre2/src/pcre2posix_test.c
+src/libpcre2/src/pcre2test.c
+src/libpcre2/src/pcre2test_inc.h
+src/libpcre2/testdata/grepbinary
+src/libpcre2/testdata/grepfilelist
+src/libpcre2/testdata/grepinput
+src/libpcre2/testdata/grepinput3
+src/libpcre2/testdata/grepinput8
+src/libpcre2/testdata/grepinputBad8
+src/libpcre2/testdata/grepinputBad8_Trail
+src/libpcre2/testdata/grepinputC.bz2
+src/libpcre2/testdata/grepinputC.gz
+src/libpcre2/testdata/grepinputM
+src/libpcre2/testdata/grepinputUN
+src/libpcre2/testdata/grepinputv
+src/libpcre2/testdata/grepinputx
+src/libpcre2/testdata/greplist
+src/libpcre2/testdata/greplistBad
+src/libpcre2/testdata/grepnot.bz2
+src/libpcre2/testdata/grepoutput
+src/libpcre2/testdata/grepoutput8
+src/libpcre2/testdata/grepoutputC
+src/libpcre2/testdata/grepoutputCN
+src/libpcre2/testdata/grepoutputCNU
+src/libpcre2/testdata/grepoutputCU
+src/libpcre2/testdata/grepoutputCbz2
+src/libpcre2/testdata/grepoutputCgz
+src/libpcre2/testdata/grepoutputN
+src/libpcre2/testdata/grepoutputUN
+src/libpcre2/testdata/greppatN4
+src/libpcre2/testdata/testbtables
+src/libpcre2/testdata/testinput1
+src/libpcre2/testdata/testinput10
+src/libpcre2/testdata/testinput11
+src/libpcre2/testdata/testinput12
+src/libpcre2/testdata/testinput13
+src/libpcre2/testdata/testinput14
+src/libpcre2/testdata/testinput15
+src/libpcre2/testdata/testinput16
+src/libpcre2/testdata/testinput17
+src/libpcre2/testdata/testinput18
+src/libpcre2/testdata/testinput19
+src/libpcre2/testdata/testinput2
+src/libpcre2/testdata/testinput20
+src/libpcre2/testdata/testinput21
+src/libpcre2/testdata/testinput22
+src/libpcre2/testdata/testinput23
+src/libpcre2/testdata/testinput24
+src/libpcre2/testdata/testinput25
+src/libpcre2/testdata/testinput26
+src/libpcre2/testdata/testinput27
+src/libpcre2/testdata/testinput28
+src/libpcre2/testdata/testinput29
+src/libpcre2/testdata/testinput3
+src/libpcre2/testdata/testinput4
+src/libpcre2/testdata/testinput5
+src/libpcre2/testdata/testinput6
+src/libpcre2/testdata/testinput7
+src/libpcre2/testdata/testinput8
+src/libpcre2/testdata/testinput9
+src/libpcre2/testdata/testinputheap
+src/libpcre2/testdata/testoutput1
+src/libpcre2/testdata/testoutput10
+src/libpcre2/testdata/testoutput11-16
+src/libpcre2/testdata/testoutput11-32
+src/libpcre2/testdata/testoutput12-16
+src/libpcre2/testdata/testoutput12-32
+src/libpcre2/testdata/testoutput13
+src/libpcre2/testdata/testoutput14-16
+src/libpcre2/testdata/testoutput14-32
+src/libpcre2/testdata/testoutput14-8
+src/libpcre2/testdata/testoutput15
+src/libpcre2/testdata/testoutput16
+src/libpcre2/testdata/testoutput17
+src/libpcre2/testdata/testoutput18
+src/libpcre2/testdata/testoutput19
+src/libpcre2/testdata/testoutput2
+src/libpcre2/testdata/testoutput20
+src/libpcre2/testdata/testoutput21
+src/libpcre2/testdata/testoutput22-16
+src/libpcre2/testdata/testoutput22-32
+src/libpcre2/testdata/testoutput22-8
+src/libpcre2/testdata/testoutput23
+src/libpcre2/testdata/testoutput24
+src/libpcre2/testdata/testoutput25
+src/libpcre2/testdata/testoutput26
+src/libpcre2/testdata/testoutput27
+src/libpcre2/testdata/testoutput28
+src/libpcre2/testdata/testoutput29
+src/libpcre2/testdata/testoutput3
+src/libpcre2/testdata/testoutput3A
+src/libpcre2/testdata/testoutput3B
+src/libpcre2/testdata/testoutput3C
+src/libpcre2/testdata/testoutput4
+src/libpcre2/testdata/testoutput5
+src/libpcre2/testdata/testoutput6
+src/libpcre2/testdata/testoutput7
+src/libpcre2/testdata/testoutput8-16-2
+src/libpcre2/testdata/testoutput8-16-4
+src/libpcre2/testdata/testoutput8-32-4
+src/libpcre2/testdata/testoutput8-8-2
+src/libpcre2/testdata/testoutput8-8-3
+src/libpcre2/testdata/testoutput8-8-4
+src/libpcre2/testdata/testoutput9
+src/libpcre2/testdata/testoutputheap-16
+src/libpcre2/testdata/testoutputheap-32
+src/libpcre2/testdata/testoutputheap-8
+src/libpcre2/testdata/valgrind-jit.supp
+src/libpcre2/testdata/wintestinput3
+src/libpcre2/testdata/wintestoutput3
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer.dict
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer.options
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_16.dict
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_16.options
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_32.dict
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_32.options
+src/libpcre2/vms/configure.com
+src/libpcre2/vms/openvms_readme.txt
+src/libpcre2/vms/pcre2.h_patch
+src/libpcre2/vms/stdint.h
+src/pcre2/CMakeLists.txt
+src/pcre2/__init__.py
+src/pcre2/_cy.pyx
+src/pcre2/_libpcre2.pxd
+src/pcre2.egg-info/PKG-INFO
+src/pcre2.egg-info/SOURCES.txt
+src/pcre2.egg-info/dependency_links.txt
+src/pcre2.egg-info/top_level.txt
+tests/test_groups.py
+tests/test_match.py
+tests/test_pattern.py
+tests/test_re_compatibility.py
\ No newline at end of file
diff --git a/src/pcre2.egg-info/dependency_links.txt b/src/pcre2.egg-info/dependency_links.txt
new file mode 100644 (file)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
diff --git a/src/pcre2.egg-info/top_level.txt b/src/pcre2.egg-info/top_level.txt
new file mode 100644 (file)
index 0000000..92d5e6d
--- /dev/null
@@ -0,0 +1 @@
+pcre2
diff --git a/src/pcre2/CMakeLists.txt b/src/pcre2/CMakeLists.txt
new file mode 100644 (file)
index 0000000..9508aee
--- /dev/null
@@ -0,0 +1,34 @@
# Locate the scikit-build CMake modules required to translate and build
# Cython extension modules.
find_package(Cython MODULE REQUIRED)
find_package(PythonExtensions MODULE REQUIRED)

# Make the .pxd/.pyx declarations in this directory visible when cythonizing.
# NOTE(review): directory-scoped include; target_include_directories inside
# add_pyx_file would be the modern-CMake alternative.
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Ask Cython for annotation output alongside the generated C sources.
set(CYTHON_ANNOTATE TRUE)

+# Macro to add Cython files as modules, configured to build with PCRE2.
# Compile one .pyx file into a Python extension module linked against the
# vendored static 8-bit PCRE2 library.
#
# Arguments:
#   filename - basename of the .pyx file (without extension); also used as
#              the CMake target name and the resulting module name.
#
# Fix: the original body referenced the macro argument as the garbled
# placeholder $(unknown); every occurrence is restored to ${filename}.
macro(add_pyx_file filename)
    # Cython -> C translation targeting Python 3.
    add_cython_target(${filename} C PY3)
    add_library(${filename} MODULE ${filename})
    python_extension_module(${filename})

    # PRIVATE: build requirements of the module are not propagated to
    # consumers (an extension module has none).
    target_link_libraries(${filename} PRIVATE pcre2-8-static)
    target_include_directories(${filename} PRIVATE ${PCRE2_INCLUDE_DIR})
    target_compile_options(${filename} PRIVATE ${CYTHON_EXTRA_COMPILE_ARGS})

    install(TARGETS ${filename} LIBRARY DESTINATION src/pcre2)
endmacro()
+
# Extensions are listed explicitly rather than collected with file(GLOB);
# globbing source files is recommended against:
# https://cmake.org/cmake/help/v3.14/command/file.html?highlight=file#filesystem
add_pyx_file(_cy)

# Ship the Cython sources themselves so downstream projects can cimport
# this package's declarations (Cython API usage).
install(
    FILES
        _libpcre2.pxd
        _cy.pyx
    DESTINATION
        src/pcre2
)
\ No newline at end of file
diff --git a/src/pcre2/__init__.py b/src/pcre2/__init__.py
new file mode 100755 (executable)
index 0000000..732a764
--- /dev/null
@@ -0,0 +1,497 @@
+from . import _cy
+
+from enum import auto, IntFlag
+import operator
+from itertools import islice
+from functools import lru_cache, reduce
+from types import MappingProxyType
+from sys import maxsize
+
+# The below implementation uses as a base that of Google's RE2 Python bindings:
+# https://github.com/google/re2/tree/main/python
+
+
+# ============================================================================
+#                                                                    Constants
+
+__version__ = "0.6.0"
+__libpcre2_version__ = _cy.__libpcre2_version__
+
+
+class RegexFlag(IntFlag):
+    # Flags either enable (True) or disable (False) PCRE2 options
+    NOFLAG = 0
+    IGNORECASE = _cy.CompileOption.CASELESS  # Ignore case
+    UNICODE = _cy.CompileOption.UTF  # Assume unicode "locale"
+    MULTILINE = _cy.CompileOption.MULTILINE  # Make anchors look for newline
+    DOTALL = _cy.CompileOption.DOTALL  # Make dot match newline
+    VERBOSE = _cy.CompileOption.EXTENDED  # Ignore whitespace and comments
+
+    # No corresponding flag in PCRE2, but is the opposite of `_cy.CompileOption.UCP`
+    ASCII = auto()  # ASCII-only matching for character classes
+
+
+NOFLAG = RegexFlag.NOFLAG
+ASCII = A = RegexFlag.ASCII
+IGNORECASE = I = RegexFlag.IGNORECASE
+UNICODE = U = RegexFlag.UNICODE
+MULTILINE = M = RegexFlag.MULTILINE
+DOTALL = S = RegexFlag.DOTALL
+VERBOSE = X = RegexFlag.VERBOSE
+
+
+LibraryError = _cy.LibraryError
+PatternError = error = _cy.PatternError
+
+
+# ============================================================================
+#                                                           Internal Utilities
+
+
+def _typeguard_strings(s):
+    if isinstance(s, str):
+        return str(s)
+    elif isinstance(s, (bytes, bytearray, memoryview)):
+        return bytes(s)
+    raise TypeError(f"Cannot process type {s}")
+
+
+# ============================================================================
+#                                                          Top-Level Functions
+
+
+def compile(pattern, flags=0, jit=True):
+    """
+    Compile a regular expression pattern, returning a Pattern object.
+    """
+    # Avoid recompilation if the pattern is already compiled with no option changes
+    if isinstance(pattern, Pattern):
+        if not flags == 0:
+            raise ValueError("Cannot process flags argument with a compiled pattern")
+        if pattern.jit == jit:
+            return pattern
+        # If options differ, extract the underlying string for recompilation
+        pattern = pattern.pattern
+
+    pattern = _typeguard_strings(pattern)
+    flags = RegexFlag(flags)
+
+    # Handle ASCII flag, defined as the disabling of the UCP PCRE2 option
+    options = flags & ~RegexFlag.ASCII
+    disabled_options = _cy.CompileOption.UCP if flags & RegexFlag.ASCII else 0
+
+    pcre2_code = _cy.compile(pattern, options, disabled_options)
+    if jit:
+        _cy.jit_compile(pcre2_code)
+    return Pattern(pcre2_code, pattern, flags, jit)
+
+
+def search(pattern, string, flags=0, jit=True):
+    """
+    Scan through `string` looking for a match to the pattern, returning a Match object, or None if
+    no match was found.
+    """
+    return compile(pattern, flags, jit).search(string)
+
+
+def match(pattern, string, flags=0, jit=True):
+    """
+    Match the pattern at the start of `string`, returning a Match object, or None if no match was
+    found.
+    """
+    return compile(pattern, flags, jit).match(string)
+
+
+def fullmatch(pattern, string, flags=0, jit=True):
+    """
+    Match the pattern to all of `string`, returning a Match object, or None if no match was found.
+    """
+    return compile(pattern, flags, jit).fullmatch(string)
+
+
+def finditer(pattern, string, flags=0, jit=True):
+    """
+    Return an iterator of Match objects for each non-overlapping match in the string.
+    """
+    return compile(pattern, flags, jit).finditer(string)
+
+
+def findall(pattern, string, flags=0, jit=True):
+    """
+    Return a list of all non-overlapping matches in `string`.
+
+    If one or more capture groups are present, return a list of groups for each match. Empty
+    matches are included in the result.
+    """
+    return compile(pattern, flags, jit).findall(string)
+
+
+def split(pattern, string, maxsplit=0, flags=0, jit=True):
+    """
+    Split the source string by the occurrences of the pattern, returning a list containing the
+    resulting substrings.
+
+    If capture groups are used in pattern, then the text of all groups are also returned. If
+    `maxsplit` is non-zero, at most `maxsplit` splits occur, and the remainder of `string` is
+    returned as the final element of the list.
+    """
+    return compile(pattern, flags, jit).split(string, maxsplit)
+
+
+def subn(pattern, repl, string, count=0, flags=0, jit=True):
+    """
+    Return a tuple containing `(res, number)`. `res` is the string obtained by replacing the
+    leftmost non-overlapping occurrences of the pattern in `string` by the replacement `repl`.
+    `number` is the number of substitutions that were made.
+
+    `repl` can be either a string or a callable. If it is a callable, it's passed the Match object
+    and must return a replacement string to be used.
+    """
+    return compile(pattern, flags, jit).subn(repl, string, count)
+
+
+def sub(pattern, repl, string, count=0, flags=0, jit=True):
+    """
+    Return the string obtained by replacing the leftmost non-overlapping occurrences of the pattern
+    in `string` by the replacement `repl`.
+
+    `repl` can be either a string or a callable. If it is a callable, it's passed the Match object
+    and must return a replacement string to be used.
+    """
+    return compile(pattern, flags, jit).sub(repl, string, count)
+
+
+# ============================================================================
+#                                                               Pattern Object
+
+
+class Pattern:
+    def __init__(self, pcre2_code, pattern, flags, jit):
+        if not isinstance(pcre2_code, _cy.PCRE2Code):
+            raise ValueError(
+                "PCRE2 code must be of type `_cy.PCRE2Code`. It is not recommended to instantiate "
+                "`Pattern` objects directly. Instead, use `pcre2.compile`."
+            )
+        self._pcre2_code = pcre2_code
+        self.pattern = pattern
+        self.flags = flags
+        self.jit = jit
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["_pcre2_code"]  # Remove the unpicklable pointer
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        # Note that patterns are recompiled - and optionally JIT compiled - when unpickling
+        self._pcre2_code = _cy.compile(self.pattern, self.flags)
+        if self.jit:
+            _cy.jit_compile(self._pcre2_code)
+
+    @property
+    @lru_cache(1)
+    def groups(self):
+        return _cy.pattern_capture_count(self._pcre2_code)
+
+    @property
+    @lru_cache(1)
+    def groupindex(self):
+        groupindex = _cy.pattern_name_dict(self._pcre2_code)
+        return MappingProxyType(groupindex)
+
+    def jit_compile(self):
+        """
+        JIT compile the pattern, or nothing if the pattern is already JIT compiled.
+        """
+        if not self.jit:
+            _cy.jit_compile(self._pcre2_code)
+            self.jit = True
+
+    def _match(self, string, pos=0, endpos=maxsize, options=0):
+        string = _typeguard_strings(string)
+        pos = max(0, min(pos, len(string)))
+        endpos = max(0, min(endpos, len(string)))
+        match_data, match_byte_offset, match_options = _cy.match(
+            self._pcre2_code, string, endpos, pos, options
+        )
+        if match_data:
+            return Match(match_data, self, string, pos, endpos, match_byte_offset, match_options)
+        return None
+
+    def search(self, string, pos=0, endpos=maxsize):
+        """
+        Scan through `string` looking for a match to the pattern, returning a Match object, or None
+        if no match was found.
+        """
+        return self._match(string, pos, endpos)
+
+    def match(self, string, pos=0, endpos=maxsize):
+        """
+        Match the pattern at the start of `string`, returning a Match object, or None if no match
+        was found.
+        """
+        return self._match(string, pos, endpos, options=_cy.MatchOption.ANCHORED)
+
+    def fullmatch(self, string, pos=0, endpos=maxsize):
+        """
+        Match the pattern to all of `string`, returning a Match object, or None if no match was
+        found.
+        """
+        options = _cy.MatchOption.ANCHORED | _cy.MatchOption.ENDANCHORED
+        return self._match(string, pos, endpos, options=options)
+
+    def finditer(self, string, pos=0, endpos=maxsize):
+        """
+        Return an iterator of Match objects for each non-overlapping match in the string.
+        """
+        string = _typeguard_strings(string)
+        pos = max(0, min(pos, len(string)))
+        endpos = max(0, min(endpos, len(string)))
+        for match_data, match_byte_offset, match_options in _cy.match_generator(
+            self._pcre2_code, string, endpos, pos
+        ):
+            yield Match(match_data, self, string, pos, endpos, match_byte_offset, match_options)
+
+    def findall(self, string, pos=0, endpos=maxsize):
+        """
+        Return a list of all non-overlapping matches in `string`.
+
+        If one or more capture groups are present, return a list of groups for each match. Empty
+        matches are included in the result.
+        """
+        string = _typeguard_strings(string)
+        empty = type(string)()
+        items = []
+        for match in self.finditer(string, pos, endpos):
+            if not self.groups:
+                item = match.group()
+            elif self.groups == 1:
+                item = match.groups(default=empty)[0]
+            else:
+                item = match.groups(default=empty)
+            items.append(item)
+        return items
+
+    def split(self, string, maxsplit=0):
+        """
+        Split the source string by the occurrences of the pattern, returning a list containing the
+        resulting substrings.
+
+        If capture groups are used in pattern, then the text of all groups are also returned. If
+        `maxsplit` is non-zero, at most `maxsplit` splits occur, and the remainder of `string` is
+        returned as the final element of the list.
+        """
+        string = _typeguard_strings(string)
+        if maxsplit < 0:
+            return [string]
+        parts = []
+        start = 0
+        for match in islice(self.finditer(string), maxsplit or None):
+            parts.append(string[start : match.start()])
+            parts.extend(map(match.__getitem__, range(1, self.groups + 1)))
+            start = match.end()
+        parts.append(string[start:])
+        return parts
+
+    def _suball(self, template, string):
+        template = _typeguard_strings(template)
+        string = _typeguard_strings(string)
+        options = _cy.SubstituteOption.GLOBAL | _cy.SubstituteOption.UNSET_EMPTY
+        byte_offset = 0
+        return _cy.substitute(self._pcre2_code, template, string, byte_offset, options=options)
+
+    def subn(self, repl, string, count=0):
+        """
+        Return a tuple containing `(res, number)`. `res` is the string obtained by replacing the
+        leftmost non-overlapping occurrences of the pattern in `string` by the replacement `repl`.
+        `number` is the number of substitutions that were made.
+
+        `repl` can be either a string or a callable. If it is a callable, it's passed the Match
+        object and must return a replacement string to be used.
+        """
+        string = _typeguard_strings(string)
+        if count < 0:
+            return (string, 0)
+
+        # Short circuit for global substitute
+        if count == 0 and not callable(repl):
+            return self._suball(repl, string)
+
+        parts = []
+        empty = type(string)()
+
+        # Pure python needed to apply callback functions
+        if callable(repl):
+            start = 0
+            numsubs = 0
+            for match in islice(self.finditer(string), count or None):
+                parts.append(string[start : match.start()])
+                parts.append(repl(match))
+                start = match.end()
+                numsubs += 1
+            parts.append(string[start:])
+            empty = type(string)()
+            return empty.join(parts), numsubs
+        else:
+            # Iterate through matches to get index of last match
+            repl = _typeguard_strings(repl)
+            end = 0
+            for match in islice(self.finditer(string), count or None):
+                end = match.end()
+            expanded, numsubs = self._suball(repl, string[:end])
+            parts = [expanded, string[end:]]
+
+        return empty.join(parts), numsubs
+
+    def sub(self, repl, string, count=0):
+        """
+        Return the string obtained by replacing the leftmost non-overlapping occurrences of the
+        pattern in `string` by the replacement `repl`.
+
+        `repl` can be either a string or a callable. If it is a callable, it's passed the Match
+        object and must return a replacement string to be used.
+        """
+        return self.subn(repl, string, count)[0]
+
+
+# ============================================================================
+#                                                                 Match Object
+
+
+class Match:
+    def __init__(self, pcre2_match_data, re, string, pos, endpos, byte_offset, options):
+        if not isinstance(pcre2_match_data, _cy.PCRE2MatchData):
+            raise ValueError(
+                "PCRE2 match data must be of type `_cy.PCRE2MatchData`. It is not recommended to "
+                "instantiate `Match` objects directly. Instead, use `Pattern.match`."
+            )
+        self._pcre2_match_data = pcre2_match_data
+        self.re = re
+        self.string = string
+        self.pos = pos
+        self.endpos = endpos
+        self._byte_offset = byte_offset
+        self._options = options
+
+    def __repr__(self):
+        return (
+            f"<{self.__class__.__module__}.{self.__class__.__qualname__} object; "
+            f"span={self.span()}, match={repr(self.group())}>"
+        )
+
+    def _groupguard(self, group):
+        if isinstance(group, int):
+            if not 0 <= group <= self.re.groups:
+                raise IndexError("No such group")
+            group_number = group
+        elif isinstance(group, str):
+            if group not in self.re.groupindex:
+                raise IndexError("no such group")
+            group_number = self.re.groupindex[group]
+        elif hasattr(group, "__index__"):
+            group_number = int(group.__index__())
+        else:
+            raise IndexError("No such group")
+        return group_number
+
+    def expand(self, template):
+        """
+        Return the string obtained by substitution on the template string `template`.
+        """
+        template = _typeguard_strings(template)
+        options = (
+            self._options | _cy.SubstituteOption.REPLACEMENT_ONLY | _cy.SubstituteOption.UNSET_EMPTY
+        )
+        res, _ = _cy.substitute(
+            self.re._pcre2_code,
+            template,
+            self.string,
+            self._byte_offset,
+            options=options,
+            match_data=self._pcre2_match_data,
+        )
+        return res
+
+    def span(self, group=0):
+        """
+        Return the start and end of `group` as the tuple `(start, end)`.
+
+        If `group` did not contribute to the match, `(-1, -1)` is returned.
+        """
+        group_number = self._groupguard(group)
+        return _cy.substring_span_bynumber(self._pcre2_match_data, self.string, group_number)
+
+    def __getitem__(self, group):
+        group_number = self._groupguard(group)
+        return _cy.substring_bynumber(self._pcre2_match_data, self.string, group_number)
+
+    def group(self, *groups):
+        """
+        Returns one or more subgroups of the match.
+
+        If there is a single argument, the result is a single string. If there are multiple
+        arguments, the result is a tuple with one item per argument. Without arguments, the whole
+        match is returned.
+        """
+        if not groups:
+            groups = (0,)
+        items = map(self.__getitem__, groups)
+        return next(items) if len(groups) == 1 else tuple(items)
+
+    def groups(self, default=None):
+        """
+        Return a tuple containing all the subgroups of the match.
+        """
+        items = []
+        for group in range(1, self.re.groups + 1):
+            item = self.__getitem__(group)
+            items.append(default if item is None else item)
+        return tuple(items)
+
+    def groupdict(self, default=None):
+        """
+        Return a dictionary mapping subgroup name to the matched substring for all named subgroups.
+        """
+        items = []
+        for group, index in self.re.groupindex.items():
+            item = self.__getitem__(index)
+            items.append((group, default) if item is None else (group, item))
+        return dict(items)
+
+    def start(self, group=0):
+        """
+        Return the start index of the substring matched by `group`.
+        """
+        return self.span(group)[0]
+
+    def end(self, group=0):
+        """
+        Return the end index of the substring matched by `group`.
+        """
+        return self.span(group)[1]
+
+    @property
+    @lru_cache(1)
+    def lastindex(self):
+        max_end = -1
+        max_group = None
+        # We look for the rightmost right parenthesis by keeping the first group that ends at
+        # max_end because that is the leftmost/outermost group when there are nested groups!
+        for group in range(1, self.re.groups + 1):
+            end = self.end(group)
+            if max_end < end:
+                max_end = end
+                max_group = group
+        return max_group
+
+    @property
+    @lru_cache(1)
+    def lastgroup(self):
+        max_group = self.lastindex
+        if not max_group:
+            return None
+        for group, index in self.re.groupindex.items():
+            if max_group == index:
+                return group
+        return None
diff --git a/src/pcre2/_cy.pyx b/src/pcre2/_cy.pyx
new file mode 100644 (file)
index 0000000..8993782
--- /dev/null
@@ -0,0 +1,590 @@
+# -*- coding:utf-8 -*-
+# cython: profile=True
+
+from libc.stdint cimport uint8_t, uint32_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport strlen
+from cpython.unicode cimport PyUnicode_Check, PyUnicode_AsUTF8AndSize
+from cpython.bytes cimport PyBytes_Check, PyBytes_AsStringAndSize
+
+from _libpcre2 cimport *
+
+from enum import IntFlag
+
+
+__libpcre2_version__ = f"{PCRE2_MAJOR}.{PCRE2_MINOR}"
+
+
+# ============================================================================
+#                                                              Pointer Proxies
+
+# Pointer wrappers to manage lifetime and expose to Python code
+cdef class PCRE2Code:
+    cdef pcre2_code_t *ptr
+    cdef bint _pattern_is_str
+
+    @staticmethod
+    cdef PCRE2Code from_ptr(pcre2_code_t *ptr, bint pattern_is_str):
+        """ Ownership of pointer is taken by the new instance """
+        cdef PCRE2Code code
+        code = PCRE2Code.__new__(PCRE2Code)
+        code.ptr = ptr
+        code._pattern_is_str = pattern_is_str
+        return code
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code
+        raise TypeError(f"Cannot create 'PCRE2Code' instances")
+
+    def __dealloc__(self):
+        if self.ptr is not NULL:
+            pcre2_code_free(self.ptr)
+
+
+cdef class PCRE2MatchData:
+    cdef pcre2_match_data_t *ptr
+
+    @staticmethod
+    cdef PCRE2MatchData from_ptr(pcre2_match_data_t *ptr):
+        """ Ownership of pointer is always taken by the new instance """
+        cdef PCRE2MatchData match_data
+        match_data = PCRE2MatchData.__new__(PCRE2MatchData)
+        match_data.ptr = ptr
+        return match_data
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code
+        raise TypeError(f"Cannot create 'PCRE2MatchData' instances")
+
+    def __dealloc__(self):
+        if self.ptr is not NULL:
+            pcre2_match_data_free(self.ptr)
+
+
+# ============================================================================
+#                                                           Buffer Acquisition
+
+cdef (uint8_t *, size_t) as_sptr_and_size(object obj) except *:
+    cdef:
+        int rc
+        char *sptr = NULL
+        Py_ssize_t length = 0
+
+    # Encode unicode strings as UTF-8 buffers
+    if PyUnicode_Check(obj):
+        sptr = <char *>PyUnicode_AsUTF8AndSize(obj, &length)
+        assert(sptr is not NULL) # The function is supposed to throw on errors
+    elif PyBytes_Check(obj):
+        rc = PyBytes_AsStringAndSize(obj, &sptr, &length)
+        assert(rc == 0)
+    else:
+        raise ValueError("Only objects of type 'str' and 'bytes' are supported")
+    return <uint8_t *>sptr, length
+
+
+# ============================================================================
+#                                                             Unicode Indexing
+
+cdef size_t idx_byte_to_char(
+    uint8_t *sptr, size_t byte_idx, size_t start_byte_idx = 0, size_t start_char_idx = 0
+):
+    cdef:
+        size_t cur_byte_idx = start_byte_idx
+        size_t cur_char_idx = start_char_idx
+
+    while cur_byte_idx < byte_idx:
+        if (sptr[cur_byte_idx] & 0xC0) != 0x80:
+            cur_char_idx += 1
+        cur_byte_idx += 1
+
+    return cur_char_idx
+
+
+cdef size_t idx_char_to_byte(
+    uint8_t *sptr, size_t sptr_size,
+    size_t char_idx,
+    size_t start_byte_idx = 0,
+    size_t start_char_idx = 0,
+):
+    cdef:
+        size_t cur_byte_idx = start_byte_idx
+        size_t cur_char_idx = start_char_idx
+
+    if cur_char_idx < char_idx:
+        while cur_char_idx < char_idx:
+            if (sptr[cur_byte_idx] & 0xC0) != 0x80:
+                cur_char_idx += 1
+            cur_byte_idx += 1
+
+        while cur_byte_idx < sptr_size and (sptr[cur_byte_idx] & 0xC0) == 0x80:
+            cur_byte_idx += 1
+
+    return cur_byte_idx
+
+
+# ============================================================================
+#                                                                   Exceptions
+
+class LibraryError(Exception):
+    def __init__(self, int errcode, object ctxmsg = None):
+        cdef:
+            uint8_t errmsg_sptr[120]
+            int rc
+
+        rc = pcre2_get_error_message(errcode, errmsg_sptr, sizeof(errmsg_sptr))
+        if rc == PCRE2_ERROR_NOMEMORY:
+            raise MemoryError
+        elif rc == PCRE2_ERROR_BADDATA:
+            raise ValueError(f"Unrecognized PCRE2 error code {errcode}")
+        elif rc < 0:
+            raise RuntimeError(f"Unhandled error code {rc} raised when getting error message")
+
+        # For non-negative values, return code is the length of the message
+        errmsg = errmsg_sptr[:rc].decode("UTF-8")
+        if ctxmsg:
+            errmsg = f"{ctxmsg}; {errmsg}"
+
+        super().__init__(errmsg)
+        self.msg = errmsg
+        self.code = errcode
+
+
+class PatternError(LibraryError):
+    def __init__(self, int errcode, errpos):
+        super().__init__(errcode, ctxmsg=f"compilation failed at position {errpos}")
+        self.pos = errpos
+
+
+cdef inline void raise_from_rc(int rc):
+    if rc < 0:
+        raise LibraryError(rc)
+
+
+# ============================================================================
+#                                                          Pattern Compilation
+
+
+class CompileOption(IntFlag):
+    CASELESS = PCRE2_CASELESS
+    DOTALL = PCRE2_DOTALL
+    MULTILINE = PCRE2_MULTILINE
+    EXTENDED = PCRE2_EXTENDED
+
+    # Controls the input codec (whether the input bytes are read into characters by UTF-8
+    # decoding). If the input pattern is a `str`, the default behaviour is UNICODE (and this cannot
+    # be unset). If the input pattern is a `bytes`, the default is ASCII/Latin-1 (one byte per
+    # character), but UNICODE sets this to UTF-8.
+    UTF = PCRE2_UTF
+
+    # Controls the interpretation of character values. If characters are ASCII, then (for example)
+    # '\w' does not match values outside the range 0-127. If the input pattern is a compiled with
+    # the `UTF` option (whether `str` or `bytes`), the default behaviour is `UCP` enabled; this can
+    # be disabled by the `ASCII` flag in the Python wrapper
+    UCP = PCRE2_UCP
+
+
+def compile(object pattern, uint32_t options = 0, disabled_options = 0):
+    cdef:
+        pcre2_code_t *code
+        uint8_t *patn_sptr
+        size_t patn_size
+        int rc
+        size_t errpos
+
+    # Get views into object memory
+    patn_sptr, patn_size = as_sptr_and_size(pattern)
+
+    # Lock out the use of \C which can lead to patterns matching within characters
+    options = options | PCRE2_NEVER_BACKSLASH_C
+
+    # Set Python style '\uhhhh' syntax for literal unicode characters
+    options = options | PCRE2_ALT_BSUX
+
+    # Default to UNICODE and UNICODE_PROPS for 'str' patterns and always disable these options for
+    # 'bytes' patterns
+    if PyUnicode_Check(pattern):
+        options = options | PCRE2_UTF
+
+    # Always default to Unicode property support if we are interpreting strings as Unicode for both
+    # 'str' and 'bytes' objects
+    if options & PCRE2_UTF:
+        options = options | PCRE2_UCP
+
+    # Allow for disabling any of the options set
+    options = options & ~disabled_options
+
+    code = pcre2_compile(patn_sptr, patn_size, options, &rc, &errpos, NULL)
+    if code is NULL:
+        if PyUnicode_Check(pattern):
+            errpos = idx_byte_to_char(patn_sptr, errpos)
+
+        # For some errors (e.g., unclosed groups) the whole pattern must be scanned and the error
+        # position returned is the length of the string. This means that the total range of error
+        # offset values is [0, length] inclusive
+        raise PatternError(rc, errpos)
+
+    return PCRE2Code.from_ptr(code, PyUnicode_Check(pattern))
+
+
+def jit_compile(PCRE2Code code not None):
+    raise_from_rc(pcre2_jit_compile(code.ptr, PCRE2_JIT_COMPLETE))
+
+
+# ============================================================================
+#                                                       Information Extraction
+
+def pattern_is_utf(PCRE2Code code not None):
+    cdef uint32_t all_options
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_ALLOPTIONS, &all_options))
+    return bool(all_options & PCRE2_UTF)
+
+
+def pattern_capture_count(PCRE2Code code not None):
+    cdef uint32_t capture_count
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_CAPTURECOUNT, &capture_count))
+    return int(capture_count)
+
+
+def pattern_name_dict(PCRE2Code code not None):
+    cdef:
+        const uint8_t *name_table
+        const uint8_t *name
+        uint32_t name_count, name_entry_size
+        int idx, offset
+        object encoding
+
+    # Get name table related information
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMECOUNT, &name_count))
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size))
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMETABLE, &name_table))
+
+    encoding = "UTF-8" if pattern_is_utf(code) else "Latin-1"
+
+    # Convert byte table to dictionary mapping group names to numbers
+    name_dict = {}
+    for idx in range(name_count):
+        # The name table is structured so that the first two bytes of each entry contain the group
+        # number, followed by the name string (which can be assumed to be Latin-1 for non-unicode
+        # patterns). Default builds of PCRE2 only allow ASCII character names.
+        offset = idx * name_entry_size
+        name = &name_table[offset + 2]
+        group_name = name[:strlen(<const char *>name)].decode(encoding)
+        group_number = int((name_table[offset] << 8) | name_table[offset + 1])
+        name_dict[group_name] = group_number
+
+    return name_dict
+
+
+def substring_span_bynumber(PCRE2MatchData match_data not None, object subject, size_t number):
+    cdef:
+        size_t *ovector
+        uint8_t *subj_sptr
+        size_t subj_size
+        int rc
+        size_t start
+        size_t end
+
+    # Get views into object memory
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    # Only perform offset lookup if group has been set
+    rc = pcre2_substring_length_bynumber(match_data.ptr, number, NULL)
+    if rc == 0:
+        ovector = pcre2_get_ovector_pointer(match_data.ptr)
+        start = ovector[2 * number]
+        end = ovector[2 * number + 1]
+
+        if PyUnicode_Check(subject):
+            start = idx_byte_to_char(subj_sptr, start)
+            end = idx_byte_to_char(subj_sptr, end)
+
+        return (start, end)
+
+    return (-1, -1)
+
+
+def substring_bynumber(PCRE2MatchData match_data not None, object subject, size_t number):
+    cdef:
+        size_t *ovector
+        uint8_t *subj_sptr
+        size_t subj_size
+        int rc
+        size_t start
+        size_t end
+
+    # Get views into object memory
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    # Only perform offset lookup if group has been set
+    rc = pcre2_substring_length_bynumber(match_data.ptr, number, NULL)
+    if rc == PCRE2_ERROR_UNSET:
+        return None
+    raise_from_rc(rc)
+
+    ovector = pcre2_get_ovector_pointer(match_data.ptr)
+    start = ovector[2 * number]
+    end = ovector[2 * number + 1]
+
+    res_obj = bytes(subj_sptr[start:end])
+    if PyUnicode_Check(subject):
+        res_obj = res_obj.decode("UTF-8")
+    return res_obj
+
+
+# ============================================================================
+#                                                                     Matching
+
+class MatchOption(IntFlag):
+    ANCHORED = PCRE2_ANCHORED
+    ENDANCHORED = PCRE2_ENDANCHORED
+
+cdef pcre2_match_data_t * _pcre2_match_data_create_from_pattern(
+    const pcre2_code_t *code, pcre2_general_context_t *gcontext
+):
+    return pcre2_match_data_create_from_pattern(code, gcontext)
+
+cdef int _pcre2_match(
+    const pcre2_code_t *code,
+    pcre2_sptr_t subject,
+    size_t length,
+    size_t startoffset,
+    uint32_t options,
+    pcre2_match_data_t *match_data,
+    pcre2_match_context_t *mcontext
+):
+    return pcre2_match(code, subject, length, startoffset, options, match_data, mcontext)
+
+cdef PCRE2MatchData _match(
+    PCRE2Code code,
+    uint8_t *subj_sptr,
+    size_t byte_length,
+    size_t byte_offset,
+    uint32_t options,
+) except *:
+    cdef:
+        pcre2_match_data_t *match_data_ptr
+        int rc
+
+    # Allocate memory for match data, returning NULL if the memory could not be obtained
+    match_data_ptr = _pcre2_match_data_create_from_pattern(code.ptr, NULL)
+    if match_data_ptr is NULL:
+        raise MemoryError
+
+    # Attempt match of pattern onto the subject
+    rc = _pcre2_match(code.ptr, subj_sptr, byte_length, byte_offset, options, match_data_ptr, NULL)
+    if rc == PCRE2_ERROR_NOMATCH:
+        return None
+    raise_from_rc(rc)
+
+    return PCRE2MatchData.from_ptr(match_data_ptr)
+
+def match(
+    PCRE2Code code not None,
+    object subject,
+    size_t length, # length & offset in logical (index) units
+    size_t offset,
+    uint32_t options = 0,
+):
+    """Attempt a single match of the compiled pattern against ``subject``.
+
+    ``length`` and ``offset`` are character indices for str subjects and byte
+    offsets for bytes subjects; str indices are converted to byte offsets
+    below before the low-level call. Returns a 3-tuple of
+    (PCRE2MatchData or None, byte offset used, effective option bits).
+    """
+    cdef:
+        uint8_t *subj_sptr
+        size_t subj_size
+
+    # Although the error message says "cannot use..." there would actually be nothing wrong at all
+    # with removing this block and allowing it. It's simply a matter of policy and clarity, and to
+    # match Python's re module.
+    if code._pattern_is_str ^ PyUnicode_Check(subject):
+        if code._pattern_is_str:
+            raise TypeError("Cannot use a string pattern on a bytes-like object")
+        else:
+            raise TypeError("Cannot use a bytes pattern on a string-like object")
+
+    # Get views into object memory
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    if PyUnicode_Check(subject):
+        # Disable UTF-8 encoding checks for improved performance
+        options |= PCRE2_NO_UTF_CHECK
+
+        # Convert character indices into byte offsets. The whole-string case is
+        # short-circuited to the known byte size; otherwise idx_char_to_byte
+        # (defined elsewhere in this module — presumably a scan of the UTF-8
+        # data; TODO confirm) performs the conversion.
+        length = (
+            subj_size if length == len(subject) else idx_char_to_byte(subj_sptr, subj_size, length)
+        )
+        offset = (
+            subj_size if offset == len(subject) else idx_char_to_byte(subj_sptr, subj_size, offset)
+        )
+
+    return _match(code, subj_sptr, length, offset, options), offset, options
+
+
+def match_generator(
+    PCRE2Code code not None,
+    object subject,
+    size_t length, # length & offset in logical (index) units
+    size_t offset,
+):
+    """Yield successive non-overlapping matches of the pattern over ``subject``.
+
+    Each yielded item is a 3-tuple of (PCRE2MatchData, byte offset the attempt
+    started from, option bits used for that attempt). After an empty match,
+    PCRE2_NOTEMPTY_ATSTART is set for the next attempt so the scan always
+    makes progress.
+    """
+    cdef:
+        uint32_t starting_options = 0
+        uint32_t state_options = 0
+        uint32_t match_options
+        size_t byte_length = length
+        size_t byte_offset = offset
+        size_t match_byte_offset
+
+    # Although the error message says "cannot use..." there would actually be nothing wrong at all
+    # with removing this block and allowing it. It's simply a matter of policy and clarity, and to
+    # match Python's re module.
+    if code._pattern_is_str ^ PyUnicode_Check(subject):
+        if code._pattern_is_str:
+            raise TypeError("Cannot use a string pattern on a bytes-like object")
+        else:
+            raise TypeError("Cannot use a bytes pattern on a string-like object")
+
+    # Get views into object memory.
+    # NOTE(review): unlike match(), subj_sptr/subj_size are not declared in the
+    # cdef block above — confirm Cython still types them as intended here.
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    if PyUnicode_Check(subject):
+        # Disable UTF-8 encoding checks for improved performance
+        starting_options |= PCRE2_NO_UTF_CHECK
+
+        # Convert character indices into byte offsets (whole-string case is
+        # short-circuited to the known byte size).
+        byte_length = (
+            subj_size if length == len(subject) else idx_char_to_byte(subj_sptr, subj_size, length)
+        )
+        byte_offset = (
+            subj_size if offset == len(subject) else idx_char_to_byte(subj_sptr, subj_size, offset)
+        )
+
+    while byte_offset <= byte_length:
+        match_options = starting_options | state_options
+        match_byte_offset = byte_offset
+        match_data = _match(code, subj_sptr, byte_length, match_byte_offset, match_options)
+        if not match_data:
+            # _match() returns None on PCRE2_ERROR_NOMATCH; stop iterating.
+            break
+
+        else:
+            ovector = pcre2_get_ovector_pointer(match_data.ptr)
+
+            # Sanity: the match must start at or after the attempt offset, and
+            # NOTEMPTY_ATSTART must have forced progress past the old offset.
+            assert(match_byte_offset <= ovector[0] and ovector[0] <= ovector[1])
+            assert(ovector[1] > match_byte_offset or state_options == 0)
+
+            if ovector[0] == ovector[1]:
+                # If the matched string is empty ensure the next match makes progress
+                state_options = PCRE2_NOTEMPTY_ATSTART
+            else:
+                state_options = 0  # Reset options so empty strings can match at next offset
+
+            # Continue the scan from the end of this match.
+            byte_offset = ovector[1]
+
+            yield match_data, match_byte_offset, match_options
+
+            # No need to re-match after an empty match at the end (it will just find nothing)
+            if ovector[0] == ovector[1] and ovector[1] >= byte_length:
+                break
+
+
+# ============================================================================
+#                                                                 Substitution
+
+
+class SubstituteOption(IntFlag):
+    """Option flags accepted by substitute(); values mirror the corresponding
+    PCRE2_SUBSTITUTE_* option bits."""
+    GLOBAL = PCRE2_SUBSTITUTE_GLOBAL
+    UNSET_EMPTY = PCRE2_SUBSTITUTE_UNSET_EMPTY
+    REPLACEMENT_ONLY = PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
+
+def substitute(
+    PCRE2Code code not None,
+    object replacement,
+    object subject,
+    size_t byte_offset, # in bytes - unlike _cy.match()
+    uint32_t options = 0,
+    PCRE2MatchData match_data = None,
+):
+    """Substitute occurrences of the compiled pattern within ``subject``.
+
+    Returns a 2-tuple of (result, count): ``result`` has the same type as
+    ``subject`` (str or bytes), and ``count`` is the number of substitutions
+    made (the non-error return code of pcre2_substitute()). When
+    ``match_data`` is given, PCRE2_SUBSTITUTE_MATCHED reuses that match
+    instead of re-matching.
+    """
+    cdef:
+        int rc
+        pcre2_match_data_t *match_data_ptr = NULL
+        uint8_t *subj_sptr
+        uint8_t *repl_sptr
+        uint8_t *res_sptr
+        size_t subj_size, repl_size, res_size
+
+    # Always compute the needed length if there is any overflow
+    options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
+
+    # Add support for backslash escape characters and Python substitution forms
+    options |= PCRE2_SUBSTITUTE_EXTENDED
+
+    # Although the error message says "cannot use..." there would actually be nothing wrong at all
+    # with removing this block and allowing it. It's simply a matter of policy and clarity, and to
+    # match Python's re module.
+    if code._pattern_is_str ^ PyUnicode_Check(subject):
+        if code._pattern_is_str:
+            raise TypeError("Cannot use a string pattern on a bytes-like object")
+        else:
+            raise TypeError("Cannot use a bytes pattern on a string-like object")
+
+    # Similarly, ensure that there is a match between the type of subject and replacement.
+    #
+    # Unlike the check that pattern and subject match, this one cannot simply be removed. We
+    # pass in the PCRE2_NO_UTF_CHECK flag based on the type of subject, and that flag also affects
+    # the interpretation of replacement. So, we require a check that the replacement string is
+    # valid UTF-8, if the subject is a 'str' object (note that we could do this either by enforcing
+    # that replacement is a 'str', or we could allow bytes as well if we do the decode here to
+    # validate it).
+    #
+    # For policy and clarity, we additionally forbid using a 'str' replacement with a 'bytes'
+    # subject, although there is no issue with that combination.
+    if PyUnicode_Check(subject) ^ PyUnicode_Check(replacement):
+        if PyUnicode_Check(subject):
+            raise TypeError("Cannot use a string subject with a bytes-like template")
+        else:
+            raise TypeError("Cannot use a bytes subject with a string-like template")
+
+    # Get views into object memory
+    repl_sptr, repl_size = as_sptr_and_size(replacement)
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    # Disable UTF-8 encoding checks for improved performance
+    if match_data is None and PyUnicode_Check(subject):
+        options |= PCRE2_NO_UTF_CHECK
+
+    if match_data is not None:
+        match_data_ptr = match_data.ptr
+        options |= PCRE2_SUBSTITUTE_MATCHED
+
+    # Make simple attempt at guess for required memory, unless match has already been made.
+    # (The conditional applies to the whole sum: 1.5x the subject size when no prior match
+    # data was supplied, else zero — the OVERFLOW_LENGTH retry below handles undersizing.)
+    res_size = subj_size + (subj_size // 2) if match_data is None else 0
+    res_sptr = <uint8_t *>malloc(res_size * sizeof(uint8_t))
+    # malloc() can fail; without this check a NULL buffer with a non-zero size
+    # would be handed to pcre2_substitute(). A NULL pointer with res_size == 0
+    # is fine: PCRE2 then reports NOMEMORY along with the required length.
+    if res_sptr is NULL and res_size > 0:
+        raise MemoryError
+    try:
+        rc = pcre2_substitute(
+            code.ptr,
+            subj_sptr, subj_size,
+            byte_offset,
+            options,
+            match_data_ptr,
+            NULL,
+            repl_sptr, repl_size,
+            res_sptr, &res_size,
+        )
+        # Reattempt substitution if no memory, now with required size of buffer known
+        if rc == PCRE2_ERROR_NOMEMORY:
+            free(res_sptr)
+            res_sptr = <uint8_t *>malloc(res_size * sizeof(uint8_t))
+            # Guard the second allocation too; free(NULL) in the finally
+            # clause is harmless if we raise here.
+            if res_sptr is NULL and res_size > 0:
+                raise MemoryError
+            rc = pcre2_substitute(
+                code.ptr,
+                subj_sptr, subj_size,
+                byte_offset,
+                options,
+                match_data_ptr,
+                NULL,
+                repl_sptr, repl_size,
+                res_sptr, &res_size,
+            )
+        raise_from_rc(rc)
+
+        # Non-error return code contains the number of substitutions made
+        res_obj = bytes(res_sptr[:res_size])
+        if PyUnicode_Check(subject):
+            # Match the type of the return object to the input object
+            res_obj = res_obj.decode("UTF-8")
+        return (res_obj, rc)
+
+    finally:
+        free(res_sptr)
diff --git a/src/pcre2/_libpcre2.pxd b/src/pcre2/_libpcre2.pxd
new file mode 100755 (executable)
index 0000000..5f299c7
--- /dev/null
@@ -0,0 +1,500 @@
+# -*- coding:utf-8 -*-
+
+from libc.stdint cimport uint8_t, uint32_t, int32_t
+
+
+cdef extern from "pcre2.h":
+    cdef unsigned int PCRE2_MAJOR
+    cdef unsigned int PCRE2_MINOR
+
+    # The following option bits can be passed to pcre2_compile(),
+    # pcre2_match(), or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the
+    # function to which it is passed. Put these bits at the most significant
+    # end of the options word so others can be added next to them.
+    cdef unsigned int PCRE2_ANCHORED
+    cdef unsigned int PCRE2_NO_UTF_CHECK
+    cdef unsigned int PCRE2_ENDANCHORED
+
+    # The following option bits can be passed only to pcre2_compile(). However,
+    # they may affect compilation, JIT compilation, and/or interpretive
+    # execution. The following tags indicate which:
+    # C   alters what is compiled by pcre2_compile()
+    # J   alters what is compiled by pcre2_jit_compile()
+    # M   is inspected during pcre2_match() execution
+    # D   is inspected during pcre2_dfa_match() execution
+    cdef unsigned int PCRE2_ALLOW_EMPTY_CLASS    # C       
+    cdef unsigned int PCRE2_ALT_BSUX             # C       
+    cdef unsigned int PCRE2_AUTO_CALLOUT         # C       
+    cdef unsigned int PCRE2_CASELESS             # C       
+    cdef unsigned int PCRE2_DOLLAR_ENDONLY       #   J M D 
+    cdef unsigned int PCRE2_DOTALL               # C       
+    cdef unsigned int PCRE2_DUPNAMES             # C       
+    cdef unsigned int PCRE2_EXTENDED             # C       
+    cdef unsigned int PCRE2_FIRSTLINE            #   J M D 
+    cdef unsigned int PCRE2_MATCH_UNSET_BACKREF  # C J M   
+    cdef unsigned int PCRE2_MULTILINE            # C       
+    cdef unsigned int PCRE2_NEVER_UCP            # C       
+    cdef unsigned int PCRE2_NEVER_UTF            # C       
+    cdef unsigned int PCRE2_NO_AUTO_CAPTURE      # C       
+    cdef unsigned int PCRE2_NO_AUTO_POSSESS      # C       
+    cdef unsigned int PCRE2_NO_DOTSTAR_ANCHOR    # C       
+    cdef unsigned int PCRE2_NO_START_OPTIMIZE    #   J M D 
+    cdef unsigned int PCRE2_UCP                  # C J M D 
+    cdef unsigned int PCRE2_UNGREEDY             # C       
+    cdef unsigned int PCRE2_UTF                  # C J M D 
+    cdef unsigned int PCRE2_NEVER_BACKSLASH_C    # C       
+    cdef unsigned int PCRE2_ALT_CIRCUMFLEX       #   J M D 
+    cdef unsigned int PCRE2_ALT_VERBNAMES        # C       
+    cdef unsigned int PCRE2_USE_OFFSET_LIMIT     #   J M D 
+    cdef unsigned int PCRE2_EXTENDED_MORE        # C       
+    cdef unsigned int PCRE2_LITERAL              # C       
+    cdef unsigned int PCRE2_MATCH_INVALID_UTF    #   J M D
+
+    # An additional compile options word is available in the compile context. 
+    cdef unsigned int PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  # C 
+    cdef unsigned int PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    # C 
+    cdef unsigned int PCRE2_EXTRA_MATCH_WORD               # C 
+    cdef unsigned int PCRE2_EXTRA_MATCH_LINE               # C 
+    cdef unsigned int PCRE2_EXTRA_ESCAPED_CR_IS_LF         # C 
+    cdef unsigned int PCRE2_EXTRA_ALT_BSUX                 # C 
+    cdef unsigned int PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     # C 
+
+    # These are for pcre2_jit_compile(). 
+    cdef unsigned int PCRE2_JIT_COMPLETE  # For full matching.
+    cdef unsigned int PCRE2_JIT_PARTIAL_SOFT
+    cdef unsigned int PCRE2_JIT_PARTIAL_HARD
+    cdef unsigned int PCRE2_JIT_INVALID_UTF
+
+    # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and
+    # pcre2_substitute(). Some are allowed only for one of the functions, and
+    # in these cases it is noted below. Note that PCRE2_ANCHORED,
+    # PCRE2_ENDANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these
+    # functions (though pcre2_jit_match() ignores the latter since it bypasses
+    # all sanity checks).
+    cdef unsigned int PCRE2_NOTBOL
+    cdef unsigned int PCRE2_NOTEOL
+    cdef unsigned int PCRE2_NOTEMPTY          # ) These two must be kept
+    cdef unsigned int PCRE2_NOTEMPTY_ATSTART  # ) adjacent to each other. 
+    cdef unsigned int PCRE2_PARTIAL_SOFT
+    cdef unsigned int PCRE2_PARTIAL_HARD
+    cdef unsigned int PCRE2_DFA_RESTART  # pcre2_dfa_match() only 
+    cdef unsigned int PCRE2_DFA_SHORTEST  # pcre2_dfa_match() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_GLOBAL  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_EXTENDED  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_UNSET_EMPTY  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_UNKNOWN_UNSET  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_NO_JIT  # Not for pcre2_dfa_match() 
+    cdef unsigned int PCRE2_COPY_MATCHED_SUBJECT
+    cdef unsigned int PCRE2_SUBSTITUTE_LITERAL  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_MATCHED  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  # pcre2_substitute() only 
+
+    # Options for pcre2_pattern_convert(). 
+    cdef unsigned int PCRE2_CONVERT_UTF
+    cdef unsigned int PCRE2_CONVERT_NO_UTF_CHECK
+    cdef unsigned int PCRE2_CONVERT_POSIX_BASIC
+    cdef unsigned int PCRE2_CONVERT_POSIX_EXTENDED
+    cdef unsigned int PCRE2_CONVERT_GLOB
+    cdef unsigned int PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
+    cdef unsigned int PCRE2_CONVERT_GLOB_NO_STARSTAR
+
+    # Newline and \R settings, for use in compile contexts. The newline values
+    # must be kept in step with values set in config.h and both sets must all
+    # be greater than zero.
+    cdef int PCRE2_NEWLINE_CR
+    cdef int PCRE2_NEWLINE_LF
+    cdef int PCRE2_NEWLINE_CRLF
+    cdef int PCRE2_NEWLINE_ANY
+    cdef int PCRE2_NEWLINE_ANYCRLF
+    cdef int PCRE2_NEWLINE_NUL
+
+    cdef int PCRE2_BSR_UNICODE
+    cdef int PCRE2_BSR_ANYCRLF
+
+    # Error codes for pcre2_compile(). Some of these are also used by
+    # pcre2_pattern_convert().
+    cdef int PCRE2_ERROR_END_BACKSLASH
+    cdef int PCRE2_ERROR_END_BACKSLASH_C
+    cdef int PCRE2_ERROR_UNKNOWN_ESCAPE
+    cdef int PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER
+    cdef int PCRE2_ERROR_QUANTIFIER_TOO_BIG
+    cdef int PCRE2_ERROR_MISSING_SQUARE_BRACKET
+    cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS
+    cdef int PCRE2_ERROR_CLASS_RANGE_ORDER
+    cdef int PCRE2_ERROR_QUANTIFIER_INVALID
+    cdef int PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT
+    cdef int PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY
+    cdef int PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS
+    cdef int PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING
+    cdef int PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS
+    cdef int PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE
+    cdef int PCRE2_ERROR_NULL_PATTERN
+    cdef int PCRE2_ERROR_BAD_OPTIONS
+    cdef int PCRE2_ERROR_MISSING_COMMENT_CLOSING
+    cdef int PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP
+    cdef int PCRE2_ERROR_PATTERN_TOO_LARGE
+    cdef int PCRE2_ERROR_HEAP_FAILED
+    cdef int PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS
+    cdef int PCRE2_ERROR_INTERNAL_CODE_OVERFLOW
+    cdef int PCRE2_ERROR_MISSING_CONDITION_CLOSING
+    cdef int PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH
+    cdef int PCRE2_ERROR_ZERO_RELATIVE_REFERENCE
+    cdef int PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES
+    cdef int PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED
+    cdef int PCRE2_ERROR_BAD_RELATIVE_REFERENCE
+    cdef int PCRE2_ERROR_UNKNOWN_POSIX_CLASS
+    cdef int PCRE2_ERROR_INTERNAL_STUDY_ERROR
+    cdef int PCRE2_ERROR_UNICODE_NOT_SUPPORTED
+    cdef int PCRE2_ERROR_PARENTHESES_STACK_CHECK
+    cdef int PCRE2_ERROR_CODE_POINT_TOO_BIG
+    cdef int PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED
+    cdef int PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C
+    cdef int PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE
+    cdef int PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG
+    cdef int PCRE2_ERROR_MISSING_CALLOUT_CLOSING
+    cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_VERB
+    cdef int PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P
+    cdef int PCRE2_ERROR_MISSING_NAME_TERMINATOR
+    cdef int PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME
+    cdef int PCRE2_ERROR_INVALID_SUBPATTERN_NAME
+    cdef int PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE
+    cdef int PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY
+    cdef int PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY
+    cdef int PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG
+    cdef int PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS
+    cdef int PCRE2_ERROR_CLASS_INVALID_RANGE
+    cdef int PCRE2_ERROR_OCTAL_BYTE_TOO_BIG
+    cdef int PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE
+    cdef int PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN
+    cdef int PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES
+    cdef int PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE
+    cdef int PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE
+    cdef int PCRE2_ERROR_BACKSLASH_G_SYNTAX
+    cdef int PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING
+    # Error 159 is obsolete and should now never occur 
+    cdef int PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED
+    cdef int PCRE2_ERROR_VERB_UNKNOWN
+    cdef int PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG
+    cdef int PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED
+    cdef int PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW
+    cdef int PCRE2_ERROR_INVALID_OCTAL
+    cdef int PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH
+    cdef int PCRE2_ERROR_MARK_MISSING_ARGUMENT
+    cdef int PCRE2_ERROR_INVALID_HEXADECIMAL
+    cdef int PCRE2_ERROR_BACKSLASH_C_SYNTAX
+    cdef int PCRE2_ERROR_BACKSLASH_K_SYNTAX
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS
+    cdef int PCRE2_ERROR_BACKSLASH_N_IN_CLASS
+    cdef int PCRE2_ERROR_CALLOUT_STRING_TOO_LONG
+    cdef int PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT
+    cdef int PCRE2_ERROR_UTF_IS_DISABLED
+    cdef int PCRE2_ERROR_UCP_IS_DISABLED
+    cdef int PCRE2_ERROR_VERB_NAME_TOO_LONG
+    cdef int PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG
+    cdef int PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS
+    cdef int PCRE2_ERROR_VERSION_CONDITION_SYNTAX
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS
+    cdef int PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER
+    cdef int PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER
+    cdef int PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED
+    cdef int PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP
+    cdef int PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED
+    cdef int PCRE2_ERROR_PATTERN_TOO_COMPLICATED
+    cdef int PCRE2_ERROR_LOOKBEHIND_TOO_LONG
+    cdef int PCRE2_ERROR_PATTERN_STRING_TOO_LONG
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP
+    cdef int PCRE2_ERROR_NO_SURROGATES_IN_UTF16
+    cdef int PCRE2_ERROR_BAD_LITERAL_OPTIONS
+    cdef int PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE
+    cdef int PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS
+    cdef int PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN
+    cdef int PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE
+    cdef int PCRE2_ERROR_TOO_MANY_CAPTURES
+    cdef int PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED
+    cdef int PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND
+
+    # "Expected" matching error codes: no match and partial match. 
+    cdef int PCRE2_ERROR_NOMATCH
+    cdef int PCRE2_ERROR_PARTIAL
+
+    # Error codes for UTF-8 validity checks.
+    cdef int PCRE2_ERROR_UTF8_ERR1
+    cdef int PCRE2_ERROR_UTF8_ERR2
+    cdef int PCRE2_ERROR_UTF8_ERR3
+    cdef int PCRE2_ERROR_UTF8_ERR4
+    cdef int PCRE2_ERROR_UTF8_ERR5
+    cdef int PCRE2_ERROR_UTF8_ERR6
+    cdef int PCRE2_ERROR_UTF8_ERR7
+    cdef int PCRE2_ERROR_UTF8_ERR8
+    cdef int PCRE2_ERROR_UTF8_ERR9
+    cdef int PCRE2_ERROR_UTF8_ERR10
+    cdef int PCRE2_ERROR_UTF8_ERR11
+    cdef int PCRE2_ERROR_UTF8_ERR12
+    cdef int PCRE2_ERROR_UTF8_ERR13
+    cdef int PCRE2_ERROR_UTF8_ERR14
+    cdef int PCRE2_ERROR_UTF8_ERR15
+    cdef int PCRE2_ERROR_UTF8_ERR16
+    cdef int PCRE2_ERROR_UTF8_ERR17
+    cdef int PCRE2_ERROR_UTF8_ERR18
+    cdef int PCRE2_ERROR_UTF8_ERR19
+    cdef int PCRE2_ERROR_UTF8_ERR20
+    cdef int PCRE2_ERROR_UTF8_ERR21
+
+    # Error codes for UTF-16 validity checks. 
+    cdef int PCRE2_ERROR_UTF16_ERR1
+    cdef int PCRE2_ERROR_UTF16_ERR2
+    cdef int PCRE2_ERROR_UTF16_ERR3
+
+    # Error codes for UTF-32 validity checks.
+    cdef int PCRE2_ERROR_UTF32_ERR1
+    cdef int PCRE2_ERROR_UTF32_ERR2
+
+    # Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction
+    # functions, context functions, and serializing functions. They are in
+    # numerical order. Originally they were in alphabetical order too, but now
+    # that PCRE2 is released, the numbers must not be changed.
+    cdef int PCRE2_ERROR_BADDATA
+    cdef int PCRE2_ERROR_MIXEDTABLES  # Name was changed.
+    cdef int PCRE2_ERROR_BADMAGIC
+    cdef int PCRE2_ERROR_BADMODE
+    cdef int PCRE2_ERROR_BADOFFSET
+    cdef int PCRE2_ERROR_BADOPTION
+    cdef int PCRE2_ERROR_BADREPLACEMENT
+    cdef int PCRE2_ERROR_BADUTFOFFSET
+    cdef int PCRE2_ERROR_CALLOUT  # Never used by PCRE2 itself.
+    cdef int PCRE2_ERROR_DFA_BADRESTART
+    cdef int PCRE2_ERROR_DFA_RECURSE
+    cdef int PCRE2_ERROR_DFA_UCOND
+    cdef int PCRE2_ERROR_DFA_UFUNC
+    cdef int PCRE2_ERROR_DFA_UITEM
+    cdef int PCRE2_ERROR_DFA_WSSIZE
+    cdef int PCRE2_ERROR_INTERNAL
+    cdef int PCRE2_ERROR_JIT_BADOPTION
+    cdef int PCRE2_ERROR_JIT_STACKLIMIT
+    cdef int PCRE2_ERROR_MATCHLIMIT
+    cdef int PCRE2_ERROR_NOMEMORY
+    cdef int PCRE2_ERROR_NOSUBSTRING
+    cdef int PCRE2_ERROR_NOUNIQUESUBSTRING
+    cdef int PCRE2_ERROR_NULL
+    cdef int PCRE2_ERROR_RECURSELOOP
+    cdef int PCRE2_ERROR_DEPTHLIMIT
+    cdef int PCRE2_ERROR_RECURSIONLIMIT  # Obsolete synonym. 
+    cdef int PCRE2_ERROR_UNAVAILABLE
+    cdef int PCRE2_ERROR_UNSET
+    cdef int PCRE2_ERROR_BADOFFSETLIMIT
+    cdef int PCRE2_ERROR_BADREPESCAPE
+    cdef int PCRE2_ERROR_REPMISSINGBRACE
+    cdef int PCRE2_ERROR_BADSUBSTITUTION
+    cdef int PCRE2_ERROR_BADSUBSPATTERN
+    cdef int PCRE2_ERROR_TOOMANYREPLACE
+    cdef int PCRE2_ERROR_BADSERIALIZEDDATA
+    cdef int PCRE2_ERROR_HEAPLIMIT
+    cdef int PCRE2_ERROR_CONVERT_SYNTAX
+    cdef int PCRE2_ERROR_INTERNAL_DUPMATCH
+    cdef int PCRE2_ERROR_DFA_UINVALID_UTF
+
+    # Request types for pcre2_pattern_info().
+    cdef int PCRE2_INFO_ALLOPTIONS
+    cdef int PCRE2_INFO_ARGOPTIONS
+    cdef int PCRE2_INFO_BACKREFMAX
+    cdef int PCRE2_INFO_BSR
+    cdef int PCRE2_INFO_CAPTURECOUNT
+    cdef int PCRE2_INFO_FIRSTCODEUNIT
+    cdef int PCRE2_INFO_FIRSTCODETYPE
+    cdef int PCRE2_INFO_FIRSTBITMAP
+    cdef int PCRE2_INFO_HASCRORLF
+    cdef int PCRE2_INFO_JCHANGED
+    cdef int PCRE2_INFO_JITSIZE
+    cdef int PCRE2_INFO_LASTCODEUNIT
+    cdef int PCRE2_INFO_LASTCODETYPE
+    cdef int PCRE2_INFO_MATCHEMPTY
+    cdef int PCRE2_INFO_MATCHLIMIT
+    cdef int PCRE2_INFO_MAXLOOKBEHIND
+    cdef int PCRE2_INFO_MINLENGTH
+    cdef int PCRE2_INFO_NAMECOUNT
+    cdef int PCRE2_INFO_NAMEENTRYSIZE
+    cdef int PCRE2_INFO_NAMETABLE
+    cdef int PCRE2_INFO_NEWLINE
+    cdef int PCRE2_INFO_DEPTHLIMIT
+    cdef int PCRE2_INFO_RECURSIONLIMIT  # Obsolete synonym 
+    cdef int PCRE2_INFO_SIZE
+    cdef int PCRE2_INFO_HASBACKSLASHC
+    cdef int PCRE2_INFO_FRAMESIZE
+    cdef int PCRE2_INFO_HEAPLIMIT
+    cdef int PCRE2_INFO_EXTRAOPTIONS
+
+    # Request types for pcre2_config(). 
+    cdef int PCRE2_CONFIG_BSR
+    cdef int PCRE2_CONFIG_JIT
+    cdef int PCRE2_CONFIG_JITTARGET
+    cdef int PCRE2_CONFIG_LINKSIZE
+    cdef int PCRE2_CONFIG_MATCHLIMIT
+    cdef int PCRE2_CONFIG_NEWLINE
+    cdef int PCRE2_CONFIG_PARENSLIMIT
+    cdef int PCRE2_CONFIG_DEPTHLIMIT
+    cdef int PCRE2_CONFIG_RECURSIONLIMIT  # Obsolete synonym 
+    cdef int PCRE2_CONFIG_STACKRECURSE  # Obsolete 
+    cdef int PCRE2_CONFIG_UNICODE
+    cdef int PCRE2_CONFIG_UNICODE_VERSION
+    cdef int PCRE2_CONFIG_VERSION
+    cdef int PCRE2_CONFIG_HEAPLIMIT
+    cdef int PCRE2_CONFIG_NEVER_BACKSLASH_C
+    cdef int PCRE2_CONFIG_COMPILED_WIDTHS
+    cdef int PCRE2_CONFIG_TABLES_LENGTH
+
+
+    # Opaque handles for PCRE2 defined structs.
+    ctypedef struct pcre2_code_t "pcre2_code":
+        pass
+    ctypedef struct pcre2_match_data_t "pcre2_match_data":
+        pass
+    ctypedef struct pcre2_general_context_t "pcre2_general_context":
+        pass
+    ctypedef struct pcre2_compile_context_t "pcre2_compile_context":
+        pass
+    ctypedef struct pcre2_match_context_t "pcre2_match_context":
+        pass
+
+    # Basic string definition. Note that this assumes PCRE2 in compiled to
+    # support 8-bit strings.
+    ctypedef const uint8_t *pcre2_sptr_t "PCRE2_SPTR"
+
+    # Error handling functions.
+    int pcre2_get_error_message(
+        int errorcode,
+        uint8_t *buffer,
+        size_t bufflen
+    )
+
+    # Pattern compilation functions.
+    pcre2_code_t * pcre2_compile(
+        pcre2_sptr_t pattern, 
+        size_t length,
+        uint32_t options,
+        int *errorcode,
+        size_t *erroroffset,
+        pcre2_compile_context_t *ccontext
+    )
+
+    int pcre2_jit_compile(
+        pcre2_code_t *code,
+        uint32_t options
+    )
+
+
+    void pcre2_code_free(pcre2_code_t *code)
+
+    # Information on compiled pattern.
+    int pcre2_pattern_info(
+        const pcre2_code_t *code,
+        uint32_t what,
+        void *where
+    )
+
+    int pcre2_substring_number_from_name(
+        const pcre2_code_t *code,
+        pcre2_sptr_t name
+    )
+    
+    # Matching and match data functions.
+    pcre2_match_data_t * pcre2_match_data_create(
+        uint32_t ovecsize,
+        pcre2_general_context_t *gcontext
+    )
+    
+    pcre2_match_data_t * pcre2_match_data_create_from_pattern(
+        const pcre2_code_t *code,
+        pcre2_general_context_t *gcontext
+    )
+    
+    int pcre2_match(
+        const pcre2_code_t *code,
+        pcre2_sptr_t subject,
+        size_t length,
+        size_t startoffset,
+        uint32_t options,
+        pcre2_match_data_t *match_data,
+        pcre2_match_context_t *mcontext
+    )
+    int pcre2_jit_match(
+        const pcre2_code_t *code,
+        pcre2_sptr_t subject,
+        size_t length,
+        size_t startoffset,
+        uint32_t options,
+        pcre2_match_data_t *match_data,
+        pcre2_match_context_t *mcontext
+    )
+    
+    void pcre2_match_data_free(pcre2_match_data_t *match_data)
+
+    uint32_t pcre2_get_ovector_count(pcre2_match_data_t *match_data)
+
+    size_t *pcre2_get_ovector_pointer(pcre2_match_data_t *match_data)
+
+    int pcre2_substring_nametable_scan(
+        const pcre2_code_t *code,
+        pcre2_sptr_t name,
+        pcre2_sptr_t *first,
+        pcre2_sptr_t *last
+    )
+
+    # String extraction from match data blocks.
+    int pcre2_substring_length_byname(
+        pcre2_match_data_t *match_data,
+        pcre2_sptr_t name,
+        size_t *bufflen
+    )
+
+    int pcre2_substring_get_byname(
+        pcre2_match_data_t *match_data,
+        pcre2_sptr_t name, 
+        uint8_t **bufferptr,
+        size_t *bufflen
+    )
+
+    int pcre2_substring_length_bynumber(
+        pcre2_match_data_t *match_data,
+        uint32_t number,
+        size_t *bufflen
+    )
+
+    int pcre2_substring_get_bynumber(
+        pcre2_match_data_t *match_data,
+        uint32_t number,
+        uint8_t **bufferptr,
+        size_t *bufflen
+    )
+
+    # Substitution.
+    int pcre2_substitute(
+        const pcre2_code_t *code,
+        pcre2_sptr_t subject,
+        size_t length,
+        size_t startoffset,
+        uint32_t options,
+        pcre2_match_data_t *match_data,
+        pcre2_match_context_t *mcontext,
+        pcre2_sptr_t replacement,
+        size_t rlength,
+        uint8_t *outputbuffer,
+        size_t *outlengthptr
+    )
+
+    # Serialization.
+    int32_t pcre2_serialize_decode(
+        pcre2_code_t **codes,
+        int32_t number_of_codes,
+        const uint8_t *code_bytes,
+        pcre2_general_context_t *gcontex
+    )
+    int32_t pcre2_serialize_encode(
+        pcre2_code_t **codes,
+        int32_t number_of_codes,
+        uint8_t **serialized_bytes,
+        size_t *serialized_size,
+        pcre2_general_context_t *gcontex
+    )
+    void pcre2_serialize_free(uint8_t *bytes)
diff --git a/tests/test_groups.py b/tests/test_groups.py
new file mode 100644 (file)
index 0000000..a7a8c28
--- /dev/null
@@ -0,0 +1,14 @@
+import pytest
+import pcre2
+
+
+def test_match_groups():
+    assert pcre2.match("a", "a").groups() == ()
+    assert pcre2.match("(a)", "a").groups() == ("a",)
+
+    assert pcre2.match(b"a", b"a").groups() == ()
+    assert pcre2.match(b"(a)", b"a").groups() == (b"a",)
+
+    for a in ("\xe0", "\u0430", "\U0001d49c"):
+        assert pcre2.match(a, a).groups() == ()
+        assert pcre2.match("(%s)" % a, a).groups() == (a,)
diff --git a/tests/test_match.py b/tests/test_match.py
new file mode 100644 (file)
index 0000000..b083776
--- /dev/null
@@ -0,0 +1,58 @@
+import pytest
+import pcre2
+import re
+
+
+# All tests should match successfully.
+test_data_match_bounds = [
+    # Fields: (pattern, subject, flags, pos, endpos, group, expected start, expected end).
+    # The bullet character is 3 bytes in UTF-8, so the bytes subject below is
+    # 26 bytes long while the equivalent str subject is 14 characters — these
+    # cases exercise character-index vs. byte-offset handling.
+    (b".*", "aba•ba••ba•••b".encode(), 0, 0, None, 0, 0, 26),
+    (".*", "aba•ba••ba•••b", 0, 0, None, 0, 0, 14),
+    (r"\w+", "b•", 0, 0, None, 0, 0, 1),
+    (r"\w+", "b•", 0, None, None, 0, 0, 1),
+    (r"\w+", "•b", 0, 1, None, 0, 1, 2),
+    (r"\w+", "•bc", 0, 2, None, 0, 2, 3),
+    (r"\w+", "•bc", 0, 1, 2, 0, 1, 2),
+]
+
+
+@pytest.mark.parametrize("pattern,subject,flags,pos,endpos,group,start,end", test_data_match_bounds)
+def test_match_bounds(pattern, subject, flags, pos, endpos, group, start, end):
+    p = pcre2.compile(pattern, flags=flags)
+    kwargs = {}
+    if endpos is not None:
+        kwargs["endpos"] = endpos
+    if pos is not None:
+        kwargs["pos"] = pos
+    m = p.match(subject, **kwargs)
+    assert (m.start(group), m.end(group)) == (start, end)
+    if endpos is not None:
+        assert m.endpos == endpos
+    if pos is not None:
+        assert m.pos == pos
+
+
+test_data_match_substring = [
+    # Fields: (pattern, subject, flags, pos, expected full-match substring).
+    (b".*", "aba•ba••ba•••b".encode(), 0, 0, "aba•ba••ba•••b".encode()),
+    (".*", "aba•ba••ba•••b", 0, 0, "aba•ba••ba•••b"),
+]
+
+
+@pytest.mark.parametrize("pattern,subject,flags,pos,substring", test_data_match_substring)
+def test_match_substring(pattern, subject, flags, pos, substring):
+    p = pcre2.compile(pattern, flags=flags)
+    m = p.match(subject, pos=pos)
+    assert m[0] == substring
+
+
+test_data_match_expand = [
+    # Fields: (pattern, replacement template, subject, flags, pos, expected expansion).
+    # The "$0" template expands to the whole matched text, as the expected
+    # values show; the final case starts the search at pos=10.
+    (b"[abc]+", b"$0", b"dabacbaccbacccb", 0, 0, b"abacbaccbacccb"),
+    ("[abc]+", "$0", "dabacbaccbacccb", 0, 0, "abacbaccbacccb"),
+    ("[abc]+", "$0", "dabacbaccbacccb", 0, 10, "acccb"),
+]
+
+
+@pytest.mark.parametrize("pattern,replacement,subject,flags,pos,result", test_data_match_expand)
+def test_match_expand(pattern, replacement, subject, flags, pos, result):
+    p = pcre2.compile(pattern, flags=flags)
+    m = p.search(subject, pos=pos)
+    assert m.expand(replacement) == result
diff --git a/tests/test_pattern.py b/tests/test_pattern.py
new file mode 100644 (file)
index 0000000..c3c67b2
--- /dev/null
@@ -0,0 +1,237 @@
+import pytest
+import pcre2
+from pcre2._cy import LibraryError
+
+
+# Rows: (pattern, flags, expected outcome tag); covers bytes/str patterns,
+# unbalanced parens, malformed and duplicate group names, and non-ASCII names.
+test_data_pattern_compile_success = [
+    (b"a+b+c*d*", 0, "SUCCESS"),
+    (b"(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+    (b"(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+    ("å+∫+ç*∂*".encode(), 0, "SUCCESS"),
+    ("a+b+c*d*", 0, "SUCCESS"),
+    ("(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+    ("(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+    ("(?<<foo>a+b+)c*d*", 0, "COMPILE_ERROR"),
+    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", 0, "COMPILE_ERROR"),
+    ("å+∫+ç*∂*", 0, "SUCCESS"),
+    ("(?<ƒøø>a+b+)c*d*", 0, "SUCCESS"),
+]
+
+
+@pytest.mark.parametrize("pattern,flags,return_code", test_data_pattern_compile_success)
+def test_pattern_compile_success(pattern, flags, return_code):
+    """Non-JIT compilation succeeds or raises exactly as the data table says."""
+    try:
+        p = pcre2.compile(pattern, flags=flags, jit=False)
+        rc = "SUCCESS"
+        assert not p.jit
+    except pcre2.PatternError:
+        rc = "COMPILE_ERROR"
+    except pcre2.LibraryError:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+
+@pytest.mark.parametrize("pattern,flags,return_code", test_data_pattern_compile_success)
+def test_pattern_jit_compile_success(pattern, flags, return_code):
+    """Same table as the non-JIT test, but with JIT enabled and reported."""
+    try:
+        p = pcre2.compile(pattern, flags=flags, jit=True)
+        rc = "SUCCESS"
+        assert p.jit
+    except pcre2.PatternError:
+        rc = "COMPILE_ERROR"
+    except pcre2.LibraryError:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+
+# Rows: (pattern, flags, expected groupindex mapping of name -> group number).
+test_data_pattern_groupindex = [
+    (b"(?<foo>a+b+)c*d*", 0, {"foo": 1}),
+    ("(?<foo>a+b+)c*d*", 0, {"foo": 1}),
+    ("(?<ƒøø>a+b+)c*d*", 0, {"ƒøø": 1}),
+    ("(?<foo>a+b+)c*d*(?<bar>a+b+)", 0, {"foo": 1, "bar": 2}),
+    ("(?<foo>a+b+)c*(.+)d*(?<bar>a+b+)", 0, {"foo": 1, "bar": 3}),
+]
+
+
+@pytest.mark.parametrize("pattern,flags,groupindex", test_data_pattern_groupindex)
+def test_pattern_groupindex(pattern, flags, groupindex):
+    """Pattern.groupindex maps named groups to their numeric indices."""
+    p = pcre2.compile(pattern, flags=flags)
+    assert p.groupindex == groupindex
+
+
+# Rows: (pattern, subject, flags, pos, expected outcome tag).
+test_data_pattern_match_success = [
+    (b".*", b"abacbaccbacccb", 0, 0, "SUCCESS"),
+    (".*", "abacbaccbacccb", 0, 0, "SUCCESS"),
+    ("ac{3,}b", "abacbaccbacccb", 0, 0, "SUCCESS"),
+    ("a•{3,}b", "aba•ba••ba•••b", 0, 0, "SUCCESS"),
+    ("ab", "abacbaccbacccb", 0, 2, "UNMATCHED"),
+    ("((((((((((((((()))))))))))))))", "", 0, 0, "SUCCESS"),
+]
+
+
+@pytest.mark.parametrize("pattern,subject,flags,pos,return_code", test_data_pattern_match_success)
+def test_pattern_match_success(pattern, subject, flags, pos, return_code):
+    p = pcre2.compile(pattern, flags=flags)
+    try:
+        m = p.search(subject, pos=pos)
+        rc = "SUCCESS" if m else "UNMATCHED"
+    except LibraryError as e:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+
+# Rows: (pattern, subject, pos, expected number of matches from finditer).
+test_data_pattern_scan_length = [
+    (b".+", b"abacbaccbacccb", 0, 1),
+    (b".*", b"abacbaccbacccb", 0, 2),
+    (".+", "abacbaccbacccb", 0, 1),
+    (".*", "abacbaccbacccb", 0, 2),
+    ("[abc]*", "dabacbaccbacccb", 0, 3),
+    ("ac{2,}b", "abacbaccbacccb", 0, 2),
+    ("a•{2,}b", "aba•ba••ba•••b", 0, 2),
+    ("a•*b", "aba•ba••ba•••b", 0, 4),
+    ("ab", "abacbaccbacccb", 2, 0),
+]
+
+
+@pytest.mark.parametrize("pattern,subject,pos,iter_length", test_data_pattern_scan_length)
+def test_pattern_scan_length(pattern, subject, pos, iter_length):
+    p = pcre2.compile(pattern)
+    s = p.finditer(subject, pos=pos)
+    assert len(list(iter(s))) == iter_length
+
+
+# Rows: (pattern, replacement string or callable, subject, count, expected result).
+test_pattern_substitute = [
+    (b"[abc]*", b"", b"dabacbaccbacccb", 1, b"dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", 1, "dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", 0, "d"),
+    ("a(•{2,})b", "a•b", "aba•ba••ba•••b", 0, "aba•ba•ba•b"),
+    ("a(•{2,})b", "a$1b", "aba•ba••ba•••b", 0, "aba•ba••ba•••b"),
+    ("a(•{2,})b", lambda m: m[0] + m[0], "aba•ba••ba•••b", 0, "aba•ba••ba••ba•••ba•••b"),
+    ("a(•{2,})b", lambda m: m[1] + m[1], "aba•ba••ba•••b", 0, "aba•b••••••••••"),
+]
+
+
+@pytest.mark.parametrize("pattern,replacement,subject,count,result", test_pattern_substitute)
+def test_pattern_substitute(pattern, replacement, subject, count, result):
+    """Pattern.sub supports string templates and callable replacements with a count."""
+    p = pcre2.compile(pattern)
+    assert p.sub(replacement, subject, count) == result
+
+
+def test_pattern_findall():
+    """findall on compiled patterns: tuples for multi-group patterns and
+    character-class edge cases (set-operation-like sequences --, &&, ||, ~~)."""
+    p = pcre2.compile(r"(\w+)=(\d+)")
+    assert p.findall("set width=20 and height=10") == [("width", "20"), ("height", "10")]
+    # Subject covering every 7-bit character.
+    s = bytes(range(128)).decode()
+    p2 = pcre2.compile(r"[0-9--1]")
+    assert p2.findall(s) == list("-./0123456789")
+    p3 = pcre2.compile(r"[%--1]")
+    assert p3.findall(s) == list("%&'()*+,-1")
+    p4 = pcre2.compile(r"[%--]")
+    assert p4.findall(s) == list("%&'()*+,-")
+    p5 = pcre2.compile(r"[0-9&&1]")
+    assert p5.findall(s) == list("&0123456789")
+    p6 = pcre2.compile(r"[\d&&1]")
+    assert p6.findall(s) == list("&0123456789")
+    p7 = pcre2.compile(r"[0-9||a]")
+    assert p7.findall(s) == list("0123456789a|")
+    p8 = pcre2.compile(r"[\d||a]")
+    assert p8.findall(s) == list("0123456789a|")
+    p9 = pcre2.compile(r"[0-9~~1]")
+    assert p9.findall(s) == list("0123456789~")
+    p10 = pcre2.compile(r"[\d~~1]")
+    assert p10.findall(s) == list("0123456789~")
+    p11 = pcre2.compile(r"[[0-9]|]")
+    assert p11.findall(s) == list("0123456789[]")
+
+    # Quantifiers with greedy and lazy modifiers, for both str and bytes.
+    for reps in "*", "+", "?", "{1}":
+        for mod in "", "?":
+            pattern = "." + reps + mod + "yz"
+            assert pcre2.compile(pattern, pcre2.S).findall("xyz") == ["xyz"], pattern
+            pattern = pattern.encode()
+            assert pcre2.compile(pattern, pcre2.S).findall(b"xyz") == [b"xyz"], pattern
+
+
+def test_pattern_jit_findall():
+    """Module-level findall (JIT path): groups, Unicode subjects, word boundaries,
+    set-operation-like classes, possessive quantifiers, and atomic groups."""
+    assert pcre2.findall(r"(\w+)=(\d+)", "set width=20 and height=10") == [
+        ("width", "20"),
+        ("height", "10"),
+    ]
+    assert pcre2.findall(":+", "abc") == []
+    assert pcre2.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
+    assert pcre2.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]
+
+    # One-, two- and four-byte UTF-8 characters as repeated units.
+    for x in ("\xe0", "\u0430", "\U0001d49c"):
+        xx = x * 2
+        xxx = x * 3
+        string = "a%sb%sc%sd" % (x, xx, xxx)
+        assert pcre2.findall("%s+" % x, string) == [x, xx, xxx]
+        assert pcre2.findall("(%s+)" % x, string) == [x, xx, xxx]
+
+    # Zero-width word-boundary assertions.
+    assert len(pcre2.findall(r"\b", "a")) == 2
+    assert len(pcre2.findall(r"\B", "a")) == 0
+    assert len(pcre2.findall(r"\b", " ")) == 0
+    assert len(pcre2.findall(r"\b", "   ")) == 0
+    assert len(pcre2.findall(r"\B", " ")) == 2
+
+    s = bytes(range(128)).decode()
+    assert pcre2.findall(r"[--1]", s) == list("-./01")
+    assert pcre2.findall(r"[&&1]", s) == list("&1")
+    assert pcre2.findall(r"[||1]", s) == list("1|")
+    assert pcre2.findall(r"[~~1]", s) == list("1~")
+
+    assert pcre2.findall(r"(?i)(a)\1", "aa \u0100") == ["a"]
+
+    # Possessive quantifiers on single characters and groups.
+    assert pcre2.findall(r"a++", "aab") == ["aa"]
+    assert pcre2.findall(r"a*+", "aab") == ["aa", "", ""]
+    assert pcre2.findall(r"a?+", "aab") == ["a", "a", "", ""]
+    assert pcre2.findall(r"a{1,3}+", "aab") == ["aa"]
+
+    assert pcre2.findall(r"(?:ab)++", "ababc") == ["abab"]
+    assert pcre2.findall(r"(?:ab)*+", "ababc") == ["abab", "", ""]
+    assert pcre2.findall(r"(?:ab)?+", "ababc") == ["ab", "ab", "", ""]
+    assert pcre2.findall(r"(?:ab){1,3}+", "ababc") == ["abab"]
+
+    # Atomic groups (?>...) should behave like the possessive forms above.
+    assert pcre2.findall(r"(?>a+)", "aab") == ["aa"]
+    assert pcre2.findall(r"(?>a*)", "aab") == ["aa", "", ""]
+    assert pcre2.findall(r"(?>a?)", "aab") == ["a", "a", "", ""]
+    assert pcre2.findall(r"(?>a{1,3})", "aab") == ["aa"]
+
+    assert pcre2.findall(r"(?>(?:ab)+)", "ababc") == ["abab"]
+    assert pcre2.findall(r"(?>(?:ab)*)", "ababc") == ["abab", "", ""]
+    assert pcre2.findall(r"(?>(?:ab)?)", "ababc") == ["ab", "ab", "", ""]
+    assert pcre2.findall(r"(?>(?:ab){1,3})", "ababc") == ["abab"]
+
+    # Stdlib re is only needed here for re.escape on a bytes pattern.
+    import re
+
+    b = "y\u2620y\u2620y".encode("utf-8")
+    assert len(pcre2.findall(re.escape("\u2620".encode("utf-8")), b)) == 2
+
+
+def test_pattern_split():
+    """split on a character class of various Unicode full stops."""
+    pattern = "[\u002e\u3002\uff0e\uff61]"
+    assert pcre2.compile(pattern).split("a.b.c") == ["a", "b", "c"]
+
+
+def test_pattern_jit_split():
+    """Module-level split (JIT path): capture-group retention, bytes subjects,
+    Unicode fields, and the maxsplit limit (positional and keyword)."""
+    assert pcre2.split(":", ":a:b::c") == ["", "a", "b", "", "c"]
+    assert pcre2.split(":+", ":a:b::c") == ["", "a", "b", "c"]
+    assert pcre2.split("(:+)", ":a:b::c") == ["", ":", "a", ":", "b", "::", "c"]
+
+    assert pcre2.split(b":", b":a:b::c") == [b"", b"a", b"b", b"", b"c"]
+    assert pcre2.split(b":+", b":a:b::c") == [b"", b"a", b"b", b"c"]
+    assert pcre2.split(b"(:+)", b":a:b::c") == [b"", b":", b"a", b":", b"b", b"::", b"c"]
+
+    # Fields made of one-, two- and four-byte UTF-8 characters.
+    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"):
+        string = ":%s:%s::%s" % (a, b, c)
+        assert pcre2.split(":", string) == ["", a, b, "", c]
+        assert pcre2.split(":+", string) == ["", a, b, c]
+        assert pcre2.split("(:+)", string) == ["", ":", a, ":", b, "::", c]
+
+    assert pcre2.split("(?::+)", ":a:b::c") == ["", "a", "b", "c"]
+    assert pcre2.split("([b:]+)", ":a:b::c") == ["", ":", "a", ":b::", "c"]
+    assert pcre2.split("(?:b)|(?::+)", ":a:b::c") == ["", "a", "", "", "c"]
+
+    assert pcre2.split(":", ":a:b::c", 2) == ["", "a", "b::c"]
+    assert pcre2.split(":", ":a:b::c", maxsplit=2) == ["", "a", "b::c"]
+    assert pcre2.split(":", "a:b:c:d", maxsplit=2) == ["a", "b", "c:d"]
+    assert pcre2.split("(:)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"]
+    assert pcre2.split("(:+)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"]
diff --git a/tests/test_re_compatibility.py b/tests/test_re_compatibility.py
new file mode 100644 (file)
index 0000000..8909512
--- /dev/null
@@ -0,0 +1,1918 @@
+import pcre2 as re
+import string
+import multiprocessing
+from weakref import proxy
+import pytest
+
+from tests.utils import (
+    assert_raises,
+    assert_typed_equal,
+    check_pattern_error,
+    check_template_error,
+)
+
+# This file is a modified version of the tests from CPython's regex test suite, meant to provide
+# coverage for the built-in module's behavior. However, the intention is not to cover 100% of
+# Python tests. Some functionality will remain different, such as the equality of compiled
+# patterns. The goal is to cover enough of the API to make using PCRE2 feel like using the built-in
+# module. For the tests included, you can find original versions in the link below (Python bug IDs
+# are preserved for searching):
+#     https://github.com/python/cpython/blob/3.14/Lib/test/test_re.py
+
+
+class S(str):
+    # str subclass whose indexing/slicing preserves the subclass type,
+    # used to verify the API accepts str subclasses.
+    def __getitem__(self, index):
+        return S(super().__getitem__(index))
+
+
+class B(bytes):
+    # bytes subclass whose indexing/slicing preserves the subclass type,
+    # used to verify the API accepts bytes subclasses.
+    def __getitem__(self, index):
+        return B(super().__getitem__(index))
+
+
+def test_weakref():
+    s = "QabbbcR"
+    x = re.compile("ab+c")
+    y = proxy(x)
+    assert x.findall("QabbbcR") == y.findall("QabbbcR")
+
+
+def test_search_star_plus():
+    """Spans of search vs. match for * and + quantifiers, including no-match cases."""
+    assert re.search("x*", "axx").span(0) == (0, 0)
+    assert re.search("x*", "axx").span() == (0, 0)
+    assert re.search("x+", "axx").span(0) == (1, 3)
+    assert re.search("x+", "axx").span() == (1, 3)
+    assert re.search("x", "aaa") is None
+    assert re.match("a*", "xxx").span(0) == (0, 0)
+    assert re.match("a*", "xxx").span() == (0, 0)
+    assert re.match("x*", "xxxa").span(0) == (0, 3)
+    assert re.match("x*", "xxxa").span() == (0, 3)
+    assert re.match("a+", "xxx") is None
+
+
+def test_branching():
+    """Test Branching
+    Test expressions using the OR ('|') operator."""
+    assert re.match("(ab|ba)", "ab").span() == (0, 2)
+    assert re.match("(ab|ba)", "ba").span() == (0, 2)
+    assert re.match("(abc|bac|ca|cb)", "abc").span() == (0, 3)
+    assert re.match("(abc|bac|ca|cb)", "bac").span() == (0, 3)
+    assert re.match("(abc|bac|ca|cb)", "ca").span() == (0, 2)
+    assert re.match("(abc|bac|ca|cb)", "cb").span() == (0, 2)
+    assert re.match("((a)|(b)|(c))", "a").span() == (0, 1)
+    assert re.match("((a)|(b)|(c))", "b").span() == (0, 1)
+    assert re.match("((a)|(b)|(c))", "c").span() == (0, 1)
+
+
+def bump_num(matchobj):
+    """Replacement callback: return the matched integer incremented by one."""
+    int_value = int(matchobj.group(0))
+    return str(int_value + 1)
+
+
+def test_basic_re_sub():
+    """sub basics: str/bytes (and their subclasses), callables, group refs,
+    escape handling, and the PCRE2-specific reserved-escape behavior."""
+    assert_typed_equal(re.sub("y", "a", "xyz"), "xaz")
+    assert_typed_equal(re.sub("y", S("a"), S("xyz")), "xaz")
+    assert_typed_equal(re.sub(b"y", b"a", b"xyz"), b"xaz")
+    assert_typed_equal(re.sub(b"y", B(b"a"), B(b"xyz")), b"xaz")
+    assert_typed_equal(re.sub(b"y", bytearray(b"a"), bytearray(b"xyz")), b"xaz")
+    assert_typed_equal(re.sub(b"y", memoryview(b"a"), memoryview(b"xyz")), b"xaz")
+
+    for y in ("\xe0", "\u0430", "\U0001d49c"):
+        assert re.sub(y, "a", "x%sz" % y) == "xaz"
+
+    assert re.sub("(?i)b+", "x", "bbbb BBBB") == "x x"
+    assert re.sub(r"\d+", bump_num, "08.2 -2 23x99y") == "9.3 -3 24x100y"
+
+    assert re.sub(r"\d+", bump_num, "08.2 -2 23x99y", count=3) == "9.3 -3 23x99y"
+
+    # A callable's return value is taken literally; a string template is escaped.
+    assert re.sub(".", lambda m: r"\n", "x") == "\\n"
+    assert re.sub(".", r"\n", "x") == "\n"
+
+    s = r"\g<1>\g<1>"
+    assert re.sub("(.)", s, "x") == "xx"
+    assert re.sub("(.)", s.replace("\\", r"\\"), "x") == s
+    assert re.sub("(.)", lambda m: s, "x") == s
+
+    assert re.sub("(?P<a>x)", r"\g<a>\g<a>", "xx") == "xxxx"
+    assert re.sub("(?P<a>x)", r"\g<a>\g<1>", "xx") == "xxxx"
+    assert re.sub("(?P<unk>x)", r"\g<unk>\g<unk>", "xx") == "xxxx"
+    assert re.sub("(?P<unk>x)", r"\g<1>\g<1>", "xx") == "xxxx"
+    assert re.sub("()x", r"\g<0>\g<0>", "xx") == "xxxx"
+
+    assert re.sub("a", r"\t\n\v\r\f\a\b", "a") == "\t\n\v\r\f\a\b"
+    assert re.sub("a", "\t\n\v\r\f\a\b", "a") == "\t\n\v\r\f\a\b"
+    assert re.sub("a", "\t\n\v\r\f\a\b", "a") == (
+        chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7) + chr(8)
+    )
+
+    # Note that we removed the reserved characters in PCRE2 extended substitution syntax
+    for c in "cdhijkmopqswxyzABCDFGHIJKMNOPRSTVWXYZ":
+        with pytest.raises(re.LibraryError):
+            assert re.sub("a", "\\" + c, "a") == "\\" + c
+
+    assert re.sub(r"^\s*", "X", "test") == "Xtest"
+
+
+def test_bug_449964():
+    # fails for group followed by other escape
+    assert re.sub(r"(?P<unk>x)", r"\g<1>\g<1>\b", "xx") == "xx\bxx\b"
+
+
+def test_bug_449000():
+    # Test for sub() on escaped characters
+    assert re.sub(r"\r\n", r"\n", "abc\r\ndef\r\n") == "abc\ndef\n"
+    assert re.sub("\r\n", r"\n", "abc\r\ndef\r\n") == "abc\ndef\n"
+    assert re.sub(r"\r\n", "\n", "abc\r\ndef\r\n") == "abc\ndef\n"
+    assert re.sub("\r\n", "\n", "abc\r\ndef\r\n") == "abc\ndef\n"
+
+
+def test_bug_1661():
+    # Verify that flags do not get silently ignored with compiled patterns
+    pattern = re.compile(".")
+    assert_raises(ValueError, re.match, pattern, "A", re.I)
+    assert_raises(ValueError, re.search, pattern, "A", re.I)
+    assert_raises(ValueError, re.findall, pattern, "A", re.I)
+    assert_raises(ValueError, re.compile, pattern, re.I)
+
+
+def test_bug_3629():
+    # A regex that triggered a bug in the sre-code validator;
+    # the test passes if compilation does not raise.
+    re.compile("(?P<quote>)(?(quote))")
+
+
+def test_sub_template_numeric_escape():
+    # bug 776311 and friends: octal escapes in replacement templates.
+    assert re.sub("x", r"\0", "x") == "\0"
+    assert re.sub("x", r"\000", "x") == "\000"
+    assert re.sub("x", r"\001", "x") == "\001"
+    assert re.sub("x", r"\008", "x") == "\0" + "8"
+    assert re.sub("x", r"\009", "x") == "\0" + "9"
+    assert re.sub("x", r"\111", "x") == "\111"
+    assert re.sub("x", r"\117", "x") == "\117"
+    assert re.sub("x", r"\377", "x") == "\377"
+
+    # Only three octal digits are consumed; the fourth is literal.
+    assert re.sub("x", r"\1111", "x") == "\1111"
+    assert re.sub("x", r"\1111", "x") == "\111" + "1"
+
+    assert re.sub("x", r"\00", "x") == "\x00"
+    assert re.sub("x", r"\07", "x") == "\x07"
+    assert re.sub("x", r"\08", "x") == "\0" + "8"
+    assert re.sub("x", r"\09", "x") == "\0" + "9"
+    assert re.sub("x", r"\0a", "x") == "\0" + "a"
+
+    # in python2.3 (etc), these loop endlessly in sre_parser.py
+
+    assert re.sub("(((((((((((x)))))))))))", r"\11", "x") == "x"
+    assert re.sub("((((((((((y))))))))))(.)", r"\11a", "xyz") == "xza"
+
+    # Modified for different parsing behavior in PCRE2
+    assert re.sub("((((((((((y))))))))))(.)", r"\g<11>8", "xyz") == "xz8"
+
+
+def test_qualified_re_sub():
+    """sub count argument, and TypeError on duplicate/excess positional args."""
+    assert re.sub("a", "b", "aaaaa") == "bbbbb"
+    assert re.sub("a", "b", "aaaaa", count=1) == "baaaa"
+
+    with pytest.raises(TypeError, match=r"sub\(\) got multiple values for argument 'count'"):
+        re.sub("a", "b", "aaaaa", 1, count=1)
+    with pytest.raises(TypeError, match=r"sub\(\) got multiple values for argument 'flags'"):
+        re.sub("a", "b", "aaaaa", 1, 0, flags=0)
+    with pytest.raises(
+        TypeError, match=r"sub\(\) takes from 3 to 6 positional arguments but 7 were given"
+    ):
+        re.sub("a", "b", "aaaaa", 1, 0, False, 0)
+
+
+def test_bug_114660():
+    # Whitespace collapsing with backreferences in the replacement.
+    assert re.sub(r"(\S)\s+(\S)", r"\1 \2", "hello  there") == "hello there"
+
+
+def test_symbolic_groups():
+    """Named groups, backrefs and conditionals compile; >100 groups supported."""
+    re.compile(r"(?P<a>x)(?P=a)(?(a)y)")
+    re.compile(r"(?P<a1>x)(?P=a1)(?(a1)y)")
+    re.compile(r"(?P<a1>x)\1(?(1)y)")
+    re.compile(b"(?P<a1>x)(?P=a1)(?(a1)y)")
+    # New valid identifiers in Python 3
+    re.compile("(?P<µ>x)(?P=µ)(?(µ)y)")
+    re.compile("(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)")
+    # Support > 100 groups.
+    pat = "|".join("x(?P<a%d>%x)y" % (i, i) for i in range(1, 200 + 1))
+    pat = "(?:%s)(?(200)z|t)" % pat
+    assert re.match(pat, "xc8yz").span() == (0, 5)
+
+
+def test_symbolic_groups_errors():
+    # This test originally checked error messages, but we only test failure of
+    # compilation, as messages are managed by PCRE2.
+    check_pattern_error(r"(?P<a>)(?P<a>)")
+    check_pattern_error(r"(?Pxy)")
+    check_pattern_error(r"(?P<a>)(?P=a")
+    check_pattern_error(r"(?P=")
+    check_pattern_error(r"(?P=)aaaaaaaaaaaaaaa")
+    check_pattern_error(r"(?P=1)")
+    check_pattern_error(r"(?P=a)")
+    check_pattern_error(r"(?P=a1)")
+    check_pattern_error(r"(?P=a.)")
+    check_pattern_error(r"(?P<)")
+    check_pattern_error(r"(?P<a")
+    check_pattern_error(r"(?P<")
+    check_pattern_error(r"(?P<>)")
+    check_pattern_error(r"(?P<1>)")
+    check_pattern_error(r"(?P<a.>)")
+    check_pattern_error(r"(?(")
+    check_pattern_error(r"(?())")
+    check_pattern_error(r"(?(a))")
+    check_pattern_error(r"(?(-1))")
+    check_pattern_error(r"(?(1a))")
+    check_pattern_error(r"(?(a.))")
+    check_pattern_error("(?P<©>x)")
+    check_pattern_error("(?P=©)")
+    check_pattern_error("(?(©)y)")
+    check_pattern_error(b"(?P<\xc2\xb5>x)")
+    check_pattern_error(b"(?P=\xc2\xb5)")
+    check_pattern_error(b"(?(\xc2\xb5)y)")
+
+
+def test_symbolic_refs():
+    """\\g<name> references in replacement templates, including Unicode names."""
+    assert re.sub("(?P<a>x)|(?P<b>y)", r"\g<b>", "xx") == ""
+    assert re.sub("(?P<a>x)|(?P<b>y)", r"\2", "xx") == ""
+    assert re.sub(b"(?P<a1>x)", rb"\g<a1>", b"xx") == b"xx"
+    # New valid identifiers in Python 3
+    assert re.sub("(?P<µ>x)", r"\g<µ>", "xx") == "xx"
+    assert re.sub("(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)", r"\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>", "xx") == "xx"
+    # Support > 100 groups.
+    pat = "|".join("x(?P<a%d>%x)y" % (i, i) for i in range(1, 200 + 1))
+    assert re.sub(pat, r"\g<200>", "xc8yzxc8y") == "c8zc8"
+
+
+def test_symbolic_refs_errors():
+    """Malformed or unknown \\g<...> references in templates must raise."""
+    check_template_error("(?P<a>x)", r"\g<a", "xx")
+    check_template_error("(?P<a>x)", r"\g<", "xx")
+    check_template_error("(?P<a>x)", r"\g", "xx")
+    check_template_error("(?P<a>x)", r"\g<a a>", "xx")
+    check_template_error("(?P<a>x)", r"\g<>", "xx")
+    check_template_error("(?P<a>x)", r"\g<1a1>", "xx")
+    check_template_error("(?P<a>x)", r"\g<2>", "xx")
+    check_template_error("(?P<a>x)", r"\2", "xx")
+    check_template_error("(?P<a>x)", r"\g<ab>", "xx")
+    check_template_error("(?P<a>x)", r"\g<-1>", "xx")
+    check_template_error("(?P<a>x)", r"\g<+1>", "xx")
+    check_template_error("()" * 10, r"\g<1_0>", "xx")
+    check_template_error("(?P<a>x)", r"\g< 1 >", "xx")
+    check_template_error("(?P<a>x)", r"\g<©>", "xx")
+    check_template_error(b"(?P<a>x)", b"\\g<\xc2\xb5>", b"xx")
+    check_template_error("(?P<a>x)", r"\g<㊀>", "xx")
+    check_template_error("(?P<a>x)", r"\g<¹>", "xx")
+    check_template_error("(?P<a>x)", r"\g<१>", "xx")
+
+
+def test_re_subn():
+    """subn returns (new_string, number_of_substitutions) and validates args."""
+    assert re.subn("(?i)b+", "x", "bbbb BBBB") == ("x x", 2)
+    assert re.subn("b+", "x", "bbbb BBBB") == ("x BBBB", 1)
+    assert re.subn("b+", "x", "xyz") == ("xyz", 0)
+    assert re.subn("b*", "x", "xyz") == ("xxxyxzx", 4)
+    assert re.subn("b*", "x", "xyz", count=2) == ("xxxyz", 2)
+
+    with pytest.raises(TypeError):
+        re.subn("a", "b", "aaaaa", 1, count=1)
+    with pytest.raises(TypeError):
+        re.subn("a", "b", "aaaaa", 1, 0, flags=0)
+
+
+def test_re_split():
+    """split across str/bytes (and subclasses), capture-group retention,
+    optional groups producing None, and empty/zero-width separators."""
+    for string in (":a:b::c", S(":a:b::c")):
+        assert_typed_equal(re.split(":", string), ["", "a", "b", "", "c"])
+        assert_typed_equal(re.split(":+", string), ["", "a", "b", "c"])
+        assert_typed_equal(re.split("(:+)", string), ["", ":", "a", ":", "b", "::", "c"])
+    for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), memoryview(b":a:b::c")):
+        assert_typed_equal(re.split(b":", string), [b"", b"a", b"b", b"", b"c"])
+        assert_typed_equal(re.split(b":+", string), [b"", b"a", b"b", b"c"])
+        assert_typed_equal(re.split(b"(:+)", string), [b"", b":", b"a", b":", b"b", b"::", b"c"])
+    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"):
+        string = ":%s:%s::%s" % (a, b, c)
+        assert re.split(":", string) == ["", a, b, "", c]
+        assert re.split(":+", string) == ["", a, b, c]
+        assert re.split("(:+)", string) == ["", ":", a, ":", b, "::", c]
+
+    assert re.split("(?::+)", ":a:b::c") == ["", "a", "b", "c"]
+    assert re.split("(:)+", ":a:b::c") == ["", ":", "a", ":", "b", ":", "c"]
+    assert re.split("([b:]+)", ":a:b::c") == ["", ":", "a", ":b::", "c"]
+    # Unmatched alternation branches contribute None entries.
+    assert re.split("(b)|(:+)", ":a:b::c") == [
+        "",
+        None,
+        ":",
+        "a",
+        None,
+        ":",
+        "",
+        "b",
+        None,
+        "",
+        None,
+        "::",
+        "c",
+    ]
+    assert re.split("(?:b)|(?::+)", ":a:b::c") == ["", "a", "", "", "c"]
+
+    # Separators that can match the empty string.
+    for sep, expected in [
+        (":*", ["", "", "a", "", "b", "", "c", ""]),
+        ("(?::*)", ["", "", "a", "", "b", "", "c", ""]),
+        ("(:*)", ["", ":", "", "", "a", ":", "", "", "b", "::", "", "", "c", "", ""]),
+        ("(:)*", ["", ":", "", None, "a", ":", "", None, "b", ":", "", None, "c", None, ""]),
+    ]:
+        assert_typed_equal(re.split(sep, ":a:b::c"), expected)
+
+    # Purely zero-width separators (empty pattern, \b, lookarounds).
+    for sep, expected in [
+        ("", ["", ":", "a", ":", "b", ":", ":", "c", ""]),
+        (r"\b", [":", "a", ":", "b", "::", "c", ""]),
+        (r"(?=:)", ["", ":a", ":b", ":", ":c"]),
+        (r"(?<=:)", [":", "a:", "b:", ":", "c"]),
+    ]:
+        assert_typed_equal(re.split(sep, ":a:b::c"), expected)
+
+
+def test_qualified_re_split():
+    """split maxsplit limit, and TypeError on duplicate/keyword misuse."""
+    assert re.split(":", ":a:b::c", maxsplit=2) == ["", "a", "b::c"]
+    assert re.split(":", "a:b:c:d", maxsplit=2) == ["a", "b", "c:d"]
+    assert re.split("(:)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"]
+    assert re.split("(:+)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"]
+    assert re.split("(:*)", ":a:b::c", maxsplit=2) == ["", ":", "", "", "a:b::c"]
+
+    with pytest.raises(TypeError):
+        re.split(":", ":a:b::c", 2, maxsplit=2)
+    with pytest.raises(TypeError):
+        re.split(":", ":a:b::c", 2, 0, flags=0)
+
+
+def test_re_findall():
+    assert re.findall(":+", "abc") == []
+    for string in ("a:b::c:::d", S("a:b::c:::d")):
+        assert_typed_equal(re.findall(":+", string), [":", "::", ":::"])
+        assert_typed_equal(re.findall("(:+)", string), [":", "::", ":::"])
+        assert_typed_equal(re.findall("(:)(:*)", string), [(":", ""), (":", ":"), (":", "::")])
+    for string in (
+        b"a:b::c:::d",
+        B(b"a:b::c:::d"),
+        bytearray(b"a:b::c:::d"),
+        memoryview(b"a:b::c:::d"),
+    ):
+        assert_typed_equal(re.findall(b":+", string), [b":", b"::", b":::"])
+        assert_typed_equal(re.findall(b"(:+)", string), [b":", b"::", b":::"])
+        assert_typed_equal(
+            re.findall(b"(:)(:*)", string), [(b":", b""), (b":", b":"), (b":", b"::")]
+        )
+    for x in ("\xe0", "\u0430", "\U0001d49c"):
+        xx = x * 2
+        xxx = x * 3
+        string = "a%sb%sc%sd" % (x, xx, xxx)
+        assert re.findall("%s+" % x, string) == [x, xx, xxx]
+        assert re.findall("(%s+)" % x, string) == [x, xx, xxx]
+        assert re.findall("(%s)(%s*)" % (x, x), string), [(x, ""), (x, x) == (x, xx)]
+
+
+def test_bug_117612():
+    assert re.findall(r"(a|(b))", "aba"), [("a", ""), ("b", "b") == ("a", "")]
+
+
+def test_re_match():
+    """match groups/group access for str/bytes subjects, defaults for
+    unmatched groups, and mixed numeric/named group lookups."""
+    for string in ("a", S("a")):
+        assert re.match("a", string).groups() == ()
+        assert re.match("(a)", string).groups() == ("a",)
+        assert re.match("(a)", string).group(0) == "a"
+        assert re.match("(a)", string).group(1) == "a"
+        assert re.match("(a)", string).group(1, 1) == ("a", "a")
+    for string in (b"a", B(b"a"), bytearray(b"a"), memoryview(b"a")):
+        assert re.match(b"a", string).groups() == ()
+        assert re.match(b"(a)", string).groups() == (b"a",)
+        assert re.match(b"(a)", string).group(0) == b"a"
+        assert re.match(b"(a)", string).group(1) == b"a"
+        assert re.match(b"(a)", string).group(1, 1) == (b"a", b"a")
+    for a in ("\xe0", "\u0430", "\U0001d49c"):
+        assert re.match(a, a).groups() == ()
+        assert re.match("(%s)" % a, a).groups() == (a,)
+        assert re.match("(%s)" % a, a).group(0) == a
+        assert re.match("(%s)" % a, a).group(1) == a
+        assert re.match("(%s)" % a, a).group(1, 1) == (a, a)
+
+    pat = re.compile("((a)|(b))(c)?")
+    assert pat.match("a").groups() == ("a", "a", None, None)
+    assert pat.match("b").groups() == ("b", None, "b", None)
+    assert pat.match("ac").groups() == ("a", "a", None, "c")
+    assert pat.match("bc").groups() == ("b", None, "b", "c")
+    # groups(default) substitutes the default for unmatched groups.
+    assert pat.match("bc").groups("") == ("b", "", "b", "c")
+
+    pat = re.compile("(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?")
+    assert pat.match("a").group(1, 2, 3) == ("a", None, None)
+    assert pat.match("b").group("a1", "b2", "c3") == (None, "b", None)
+    assert pat.match("ac").group(1, "b2", 3) == ("a", None, "c")
+
+
+def test_group():
+    """Match.group accepts ints, __index__ objects and names; bad indices raise."""
+    class Index:
+        # Minimal object implementing __index__, to test index coercion.
+        def __init__(self, value):
+            self.value = value
+
+        def __index__(self):
+            return self.value
+
+    # A single group
+    m = re.match("(a)(b)", "ab")
+    assert m.group() == "ab"
+    assert m.group(0) == "ab"
+    assert m.group(1) == "a"
+    assert m.group(Index(1)) == "a"
+    assert_raises(IndexError, m.group, -1)
+    assert_raises(IndexError, m.group, 3)
+    assert_raises(IndexError, m.group, 1 << 1000)
+
+    # Unclear why the below fails
+    # assert_raises(IndexError, m.group, Index(1 << 1000))
+
+    assert_raises(IndexError, m.group, "x")
+    # Multiple groups
+    assert m.group(2, 1) == ("b", "a")
+    assert m.group(Index(2), Index(1)) == ("b", "a")
+
+
+def test_match_getitem():
+    pat = re.compile("(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?")
+
+    m = pat.match("a")
+    assert m["a1"] == "a"
+    assert m["b2"] == None
+    assert m["c3"] == None
+    assert "a1={a1} b2={b2} c3={c3}".format_map(m) == "a1=a b2=None c3=None"
+    assert m[0] == "a"
+    assert m[1] == "a"
+    assert m[2] == None
+    assert m[3] == None
+    with pytest.raises(IndexError):
+        m["X"]
+    with pytest.raises(IndexError):
+        m[-1]
+    with pytest.raises(IndexError):
+        m[4]
+    with pytest.raises(IndexError):
+        m[0, 1]
+    with pytest.raises(IndexError):
+        m[(0,)]
+    with pytest.raises(IndexError):
+        m[(0, 1)]
+    with pytest.raises(IndexError):
+        "a1={a2}".format_map(m)
+
+    m = pat.match("ac")
+    assert m["a1"] == "a"
+    assert m["b2"] == None
+    assert m["c3"] == "c"
+    assert "a1={a1} b2={b2} c3={c3}".format_map(m) == "a1=a b2=None c3=c"
+    assert m[0] == "ac"
+    assert m[1] == "a"
+    assert m[2] == None
+    assert m[3] == "c"
+
+    # Cannot assign.
+    with pytest.raises(TypeError):
+        m[0] = 1
+
+    # No len().
+    assert_raises(TypeError, len, m)
+
+
+def test_re_fullmatch():
+    # Issue 16203: Proposal: add re.fullmatch() method.
+    assert re.fullmatch(r"a", "a").span() == (0, 1)
+    for string in "ab", S("ab"):
+        assert re.fullmatch(r"a|ab", string).span() == (0, 2)
+    for string in (b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab")):
+        assert re.fullmatch(rb"a|ab", string).span() == (0, 2)
+    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
+        r = r"%s|%s" % (a, a + b)
+        assert re.fullmatch(r, a + b).span() == (0, 2)
+    # Lazy quantifiers must still consume the full subject under fullmatch.
+    assert re.fullmatch(r".*?$", "abc").span() == (0, 3)
+    assert re.fullmatch(r".*?", "abc").span() == (0, 3)
+    assert re.fullmatch(r"a.*?b", "ab").span() == (0, 2)
+    assert re.fullmatch(r"a.*?b", "abb").span() == (0, 3)
+    assert re.fullmatch(r"a.*?b", "axxb").span() == (0, 4)
+    assert re.fullmatch(r"a+", "ab") is None
+    assert re.fullmatch(r"abc$", "abc\n") is None
+    assert re.fullmatch(r"abc\z", "abc\n") is None
+    assert re.fullmatch(r"abc\Z", "abc\n") is None
+    assert re.fullmatch(r"(?m)abc$", "abc\n") is None
+    assert re.fullmatch(r"ab(?=c)cd", "abcd").span() == (0, 4)
+    assert re.fullmatch(r"ab(?<=b)cd", "abcd").span() == (0, 4)
+    assert re.fullmatch(r"(?=a|ab)ab", "ab").span() == (0, 2)
+
+    # fullmatch restricted to a [pos, endpos) window.
+    assert re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
+    assert re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
+    assert re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
+
+
+def test_re_groupref_exists():
+    """Conditional patterns (?(group)yes|no) by number and by name."""
+    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "(a)").groups() == ("(", "a")
+    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "a").groups() == (None, "a")
+    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "a)") is None
+    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "(a") is None
+    assert re.match("^(?:(a)|c)((?(1)b|d))$", "ab").groups() == ("a", "b")
+    assert re.match(r"^(?:(a)|c)((?(1)b|d))$", "cd").groups() == (None, "d")
+    assert re.match(r"^(?:(a)|c)((?(1)|d))$", "cd").groups() == (None, "d")
+    assert re.match(r"^(?:(a)|c)((?(1)|d))$", "a").groups() == ("a", "")
+
+    # Tests for bug #1177831: exercise groups other than the first group
+    p = re.compile("(?P<g1>a)(?P<g2>b)?((?(g2)c|d))")
+    assert p.match("abc").groups() == ("a", "b", "c")
+    assert p.match("ad").groups() == ("a", None, "d")
+    assert p.match("abd") is None
+    assert p.match("ac") is None
+
+    # Support > 100 groups.
+    pat = "|".join("x(?P<a%d>%x)y" % (i, i) for i in range(1, 200 + 1))
+    pat = "(?:%s)(?(200)z)" % pat
+    assert re.match(pat, "xc8yz").span() == (0, 5)
+
+
+def test_re_groupref_exists_errors():
+    """Malformed conditional group references must fail to compile."""
+    check_pattern_error(r"(?P<a>)(?(0)a|b)")
+    check_pattern_error(r"()(?(+1)a|b)")
+    check_pattern_error(r"()" * 10 + r"(?(1_0)a|b)")
+    check_pattern_error(r"()(?( 1 )a|b)")
+    check_pattern_error(r"()(?(㊀)a|b)")
+    check_pattern_error(r"()(?(¹)a|b)")
+    check_pattern_error(r"()(?(१)a|b)")
+    check_pattern_error(r"()(?(1")
+    check_pattern_error(r"()(?(1)a")
+    check_pattern_error(r"()(?(1)a|b")
+    check_pattern_error(r"()(?(1)a|b|c")
+    check_pattern_error(r"()(?(1)a|b|c)")
+    check_pattern_error(r"()(?(2)a)")
+
+
+def test_re_groupref_exists_validation_bug():
+    # Conditionals with every \xNN escape must compile without raising.
+    for i in range(256):
+        re.compile(r"()(?(1)\x%02x?)" % i)
+
+
+def test_re_groupref():
+    """Numeric backreferences (\\1), including optional backreferences."""
+    assert re.match(r"^(\|)?([^()]+)\1$", "|a|").groups() == ("|", "a")
+    assert re.match(r"^(\|)?([^()]+)\1?$", "a").groups() == (None, "a")
+    assert re.match(r"^(\|)?([^()]+)\1$", "a|") is None
+    assert re.match(r"^(\|)?([^()]+)\1$", "|a") is None
+    assert re.match(r"^(?:(a)|c)(\1)$", "aa").groups() == ("a", "a")
+    assert re.match(r"^(?:(a)|c)(\1)?$", "c").groups() == (None, None)
+
+
+def test_groupdict():
+    """groupdict() maps named-group names to their captured substrings."""
+    assert re.match("(?P<first>first) (?P<second>second)", "first second").groupdict() == {
+        "first": "first",
+        "second": "second",
+    }
+
+
+def test_expand():
+    """Match.expand() substitutes \\N and \\g<name> references; unmatched groups expand empty."""
+    assert (
+        re.match("(?P<first>first) (?P<second>second)", "first second").expand(
+            r"\2 \1 \g<second> \g<first>"
+        )
+        == "second first second first"
+    )
+    assert re.match("(?P<first>first)|(?P<second>second)", "first").expand(r"\2 \g<second>") == " "
+
+
+def test_repeat_minmax():
+    """{m}, {m,n} and their lazy variants bound repetition counts; a repeated
+    capture group retains only the last repetition's text."""
+    assert re.match(r"^(\w){1}$", "abc") is None
+    assert re.match(r"^(\w){1}?$", "abc") is None
+    assert re.match(r"^(\w){1,2}$", "abc") is None
+    assert re.match(r"^(\w){1,2}?$", "abc") is None
+
+    assert re.match(r"^(\w){3}$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){1,3}$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){1,4}$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){3,4}?$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){3}?$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){1,3}?$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){1,4}?$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){3,4}?$", "abc").group(1) == "c"
+
+    assert re.match(r"^x{1}$", "xxx") is None
+    assert re.match(r"^x{1}?$", "xxx") is None
+    assert re.match(r"^x{1,2}$", "xxx") is None
+    assert re.match(r"^x{1,2}?$", "xxx") is None
+
+    assert re.match(r"^x{3}$", "xxx")
+    assert re.match(r"^x{1,3}$", "xxx")
+    assert re.match(r"^x{3,3}$", "xxx")
+    assert re.match(r"^x{1,4}$", "xxx")
+    assert re.match(r"^x{3,4}?$", "xxx")
+    assert re.match(r"^x{3}?$", "xxx")
+    assert re.match(r"^x{1,3}?$", "xxx")
+    assert re.match(r"^x{1,4}?$", "xxx")
+    assert re.match(r"^x{3,4}?$", "xxx")
+
+    # Empty braces are not a quantifier: "x{}" matches the literal text "x{}".
+    assert re.match(r"^x{}$", "xxx") is None
+    assert re.match(r"^x{}$", "x{}")
+
+    # min greater than max is a compile error.
+    check_pattern_error(r"x{2,1}")
+
+
+def test_getattr():
+    """Pattern attributes (pattern, groups, groupindex) and Match attributes
+    (pos, endpos, string, re) are exposed; groupindex is read-only."""
+    assert re.compile("(?i)(a)(b)").pattern == "(?i)(a)(b)"
+    # assert re.compile("(?i)(a)(b)").flags ==  re.I | re.U  # TODO: Look into why not
+    assert re.compile("(?i)(a)(b)").groups == 2
+    assert re.compile("(?i)(a)(b)").groupindex == {}
+    assert re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex == {"first": 1, "other": 2}
+
+    assert re.match("(a)", "a").pos == 0
+    assert re.match("(a)", "a").endpos == 1
+    assert re.match("(a)", "a").string == "a"
+    assert re.match("(a)", "a").re
+
+    # Issue 14260. groupindex should be non-modifiable mapping.
+    p = re.compile(r"(?i)(?P<first>a)(?P<other>b)")
+    assert sorted(p.groupindex) == ["first", "other"]
+    assert p.groupindex["other"] == 2
+
+    with pytest.raises(TypeError):
+        p.groupindex["other"] = 0
+
+    # Mapping is unchanged after the failed assignment.
+    assert p.groupindex["other"] == 2
+
+
+def test_special_escapes():
+    """Special escapes \\b/\\B, \\A/\\Z (and PCRE2 \\z), and the class escapes
+    \\d\\D\\w\\W\\s\\S work for both str and bytes patterns."""
+    assert re.search(r"\b(b.)\b", "abcd abc bcd bx").group(1) == "bx"
+    assert re.search(r"\B(b.)\B", "abc bcd bc abxd").group(1) == "bx"
+
+    # TODO: Add ASCII
+    assert re.search(r"\b(b.)\b", "abcd abc bcd bx", re.ASCII).group(1) == "bx"
+    assert re.search(r"\B(b.)\B", "abc bcd bc abxd", re.ASCII).group(1) == "bx"
+
+    assert re.search(r"^abc$", "\nabc\n", re.M).group(0) == "abc"
+    # NOTE: \z (absolute end of subject) is PCRE2 syntax not accepted by stdlib re.
+    assert re.search(r"^\Aabc\z$", "abc", re.M).group(0) == "abc"
+    assert re.search(r"^\Aabc\z$", "\nabc\n", re.M) is None
+    assert re.search(r"^\Aabc\Z$", "abc", re.M).group(0) == "abc"
+    assert re.search(r"^\Aabc\Z$", "\nabc\n", re.M) is None
+    assert re.search(rb"\b(b.)\b", b"abcd abc bcd bx").group(1) == b"bx"
+    assert re.search(rb"\B(b.)\B", b"abc bcd bc abxd").group(1) == b"bx"
+    assert re.search(rb"^abc$", b"\nabc\n", re.M).group(0) == b"abc"
+    assert re.search(rb"^\Aabc\z$", b"abc", re.M).group(0) == b"abc"
+    assert re.search(rb"^\Aabc\z$", b"\nabc\n", re.M) is None
+    assert re.search(rb"^\Aabc\Z$", b"abc", re.M).group(0) == b"abc"
+    assert re.search(rb"^\Aabc\Z$", b"\nabc\n", re.M) is None
+    assert re.search(r"\d\D\w\W\s\S", "1aa! a").group(0) == "1aa! a"
+    assert re.search(rb"\d\D\w\W\s\S", b"1aa! a").group(0) == b"1aa! a"
+    assert re.search(r"\d\D\w\W\s\S", "1aa! a", re.ASCII).group(0) == "1aa! a"
+
+
+def test_other_escapes():
+    """Escaping metacharacters makes them literal; unknown ASCII-letter escapes
+    are compile errors, both inside and outside character classes."""
+    # A lone trailing backslash cannot compile.
+    check_pattern_error("\\")
+
+    assert re.match(r"\(", "(").group() == "("
+    assert re.match(r"\(", ")") is None
+    assert re.match(r"\\", "\\").group() == "\\"
+    assert re.match(r"[\]]", "]").group() == "]"
+    assert re.match(r"[\]]", "[") is None
+    assert re.match(r"[a\-c]", "-").group() == "-"
+    assert re.match(r"[a\-c]", "b") is None
+    assert re.match(r"[\^a]+", "a^").group() == "a^"
+    assert re.match(r"[\^a]+", "b") is None
+
+    for c in "cijlmopqyCFIJLMOPTY":
+        check_pattern_error("\\%c" % c)
+    for c in "cijlmopqyzABCFIJLMOPTYZ":
+        check_pattern_error("[\\%c]" % c)
+
+
+def test_word_boundaries():
+    """\\b and \\B semantics at word/non-word transitions, string edges, and
+    empty strings, for str, bytes, and re.ASCII (non-ASCII letters stop being
+    word characters under re.ASCII)."""
+    # See http://bugs.python.org/issue10713
+    assert re.search(r"\b(abc)\b", "abc").group(1) == "abc"
+    assert re.search(r"\b(abc)\b", "abc", re.ASCII).group(1) == "abc"
+    assert re.search(rb"\b(abc)\b", b"abc").group(1) == b"abc"
+    assert re.search(r"\b(ьюя)\b", "ьюя").group(1) == "ьюя"
+    assert re.search(r"\b(ьюя)\b", "ьюя", re.ASCII) is None
+    # There's a word boundary between a word and a non-word.
+    assert re.match(r".\b", "a=")
+    assert re.match(r".\b", "a=", re.ASCII)
+    assert re.match(rb".\b", b"a=")
+    assert re.match(r".\b", "я=")
+    assert re.match(r".\b", "я=", re.ASCII) is None
+    # There's a word boundary between a non-word and a word.
+    assert re.match(r".\b", "=a")
+    assert re.match(r".\b", "=a", re.ASCII)
+    assert re.match(rb".\b", b"=a")
+    assert re.match(r".\b", "=я")
+    assert re.match(r".\b", "=я", re.ASCII) is None
+    # There is no word boundary inside a word.
+    assert re.match(r".\b", "ab") is None
+    assert re.match(r".\b", "ab", re.ASCII) is None
+    assert re.match(rb".\b", b"ab") is None
+    assert re.match(r".\b", "юя") is None
+    assert re.match(r".\b", "юя", re.ASCII) is None
+    # There is no word boundary between a non-word characters.
+    assert re.match(r".\b", "=-") is None
+    assert re.match(r".\b", "=-", re.ASCII) is None
+    assert re.match(rb".\b", b"=-") is None
+    # There is no non-boundary match between a word and a non-word.
+    assert re.match(r".\B", "a=") is None
+    assert re.match(r".\B", "a=", re.ASCII) is None
+    assert re.match(rb".\B", b"a=") is None
+    assert re.match(r".\B", "я=") is None
+    assert re.match(r".\B", "я=", re.ASCII)
+    # There is no non-boundary match between a non-word and a word.
+    assert re.match(r".\B", "=a") is None
+    assert re.match(r".\B", "=a", re.ASCII) is None
+    assert re.match(rb".\B", b"=a") is None
+    assert re.match(r".\B", "=я") is None
+    assert re.match(r".\B", "=я", re.ASCII)
+    # There's a non-boundary match inside a word.
+    assert re.match(r".\B", "ab")
+    assert re.match(r".\B", "ab", re.ASCII)
+    assert re.match(rb".\B", b"ab")
+    assert re.match(r".\B", "юя")
+    assert re.match(r".\B", "юя", re.ASCII)
+    # There's a non-boundary match between a non-word characters.
+    assert re.match(r".\B", "=-")
+    assert re.match(r".\B", "=-", re.ASCII)
+    assert re.match(rb".\B", b"=-")
+    # There's a word boundary at the start of a string.
+    assert re.match(r"\b", "abc")
+    assert re.match(r"\b", "abc", re.ASCII)
+    assert re.match(rb"\b", b"abc")
+    assert re.match(r"\b", "ьюя")
+    assert re.match(r"\b", "ьюя", re.ASCII) is None
+    # There's a word boundary at the end of a string.
+    assert re.fullmatch(r".+\b", "abc")
+    assert re.fullmatch(r".+\b", "abc", re.ASCII)
+    assert re.fullmatch(rb".+\b", b"abc")
+    assert re.fullmatch(r".+\b", "ьюя")
+    assert re.search(r"\b", "ьюя", re.ASCII) is None
+    # A non-empty string includes a non-boundary zero-length match.
+    assert re.search(r"\B", "abc").span() == (1, 1)
+    assert re.search(r"\B", "abc", re.ASCII).span() == (1, 1)
+    assert re.search(rb"\B", b"abc").span() == (1, 1)
+    assert re.search(r"\B", "ьюя").span() == (1, 1)
+    assert re.search(r"\B", "ьюя", re.ASCII).span() == (0, 0)
+    # There is no non-boundary match at the start of a string.
+    assert re.match(r"\B", "abc") is None
+    assert re.match(r"\B", "abc", re.ASCII) is None
+    assert re.match(rb"\B", b"abc") is None
+    assert re.match(r"\B", "ьюя") is None
+    assert re.match(r"\B", "ьюя", re.ASCII)
+    # There is no non-boundary match at the end of a string.
+    assert re.fullmatch(r".+\B", "abc") is None
+    assert re.fullmatch(r".+\B", "abc", re.ASCII) is None
+    assert re.fullmatch(rb".+\B", b"abc") is None
+    assert re.fullmatch(r".+\B", "ьюя") is None
+    assert re.fullmatch(r".+\B", "ьюя", re.ASCII)
+    # However, an empty string contains no word boundaries.
+    assert re.search(r"\b", "") is None
+    assert re.search(r"\b", "", re.ASCII) is None
+    assert re.search(rb"\b", b"") is None
+    assert re.search(r"\B", "")
+    assert re.search(r"\B", "", re.ASCII)
+    assert re.search(rb"\B", b"")
+    # A single word-character string has two boundaries, but no
+    # non-boundary gaps.
+    assert len(re.findall(r"\b", "a")) == 2
+    assert len(re.findall(r"\b", "a", re.ASCII)) == 2
+    assert len(re.findall(rb"\b", b"a")) == 2
+    assert len(re.findall(r"\B", "a")) == 0
+    assert len(re.findall(r"\B", "a", re.ASCII)) == 0
+    assert len(re.findall(rb"\B", b"a")) == 0
+    # If there are no words, there are no boundaries
+    assert len(re.findall(r"\b", " ")) == 0
+    assert len(re.findall(r"\b", " ", re.ASCII)) == 0
+    assert len(re.findall(rb"\b", b" ")) == 0
+    assert len(re.findall(r"\b", "   ")) == 0
+    assert len(re.findall(r"\b", "   ", re.ASCII)) == 0
+    assert len(re.findall(rb"\b", b"   ")) == 0
+    # Can match around the whitespace.
+    assert len(re.findall(r"\B", " ")) == 2
+    assert len(re.findall(r"\B", " ", re.ASCII)) == 2
+    assert len(re.findall(rb"\B", b" ")) == 2
+
+
+def test_bigcharset():
+    """Character classes containing non-Latin-1 code points match correctly."""
+    assert re.match("([\u2222\u2223])", "\u2222").group(1) == "\u2222"
+
+
+def test_big_codesize():
+    """A pattern with 5000 alternatives compiles and matches (large compiled code)."""
+    # Issue #1160
+    r = re.compile("|".join(("%d" % x for x in range(5000))))
+    assert r.match("1000")
+    assert r.match("9999")
+
+
+def test_anyall():
+    """Under re.DOTALL, '.' also matches newline characters."""
+    assert re.match("a.b", "a\nb", re.DOTALL).group(0) == "a\nb"
+    assert re.match("a.*b", "a\n\nb", re.DOTALL).group(0) == "a\n\nb"
+
+
+def test_lookahead():
+    """Positive (?=...) and negative (?!...) lookahead, including backreferences
+    and conditional group references inside the assertion."""
+    assert re.match(r"(a(?=\s[^a]))", "a b").group(1) == "a"
+    assert re.match(r"(a(?=\s[^a]*))", "a b").group(1) == "a"
+    assert re.match(r"(a(?=\s[abc]))", "a b").group(1) == "a"
+    assert re.match(r"(a(?=\s[abc]*))", "a bc").group(1) == "a"
+    assert re.match(r"(a)(?=\s\1)", "a a").group(1) == "a"
+    assert re.match(r"(a)(?=\s\1*)", "a aa").group(1) == "a"
+    assert re.match(r"(a)(?=\s(abc|a))", "a a").group(1) == "a"
+
+    assert re.match(r"(a(?!\s[^a]))", "a a").group(1) == "a"
+    assert re.match(r"(a(?!\s[abc]))", "a d").group(1) == "a"
+    assert re.match(r"(a)(?!\s\1)", "a b").group(1) == "a"
+    assert re.match(r"(a)(?!\s(abc|a))", "a b").group(1) == "a"
+
+    # Group reference.
+    assert re.match(r"(a)b(?=\1)a", "aba")
+    assert re.match(r"(a)b(?=\1)c", "abac") is None
+    # Conditional group reference.
+    assert re.match(r"(?:(a)|(x))b(?=(?(2)x|c))c", "abc")
+    assert re.match(r"(?:(a)|(x))b(?=(?(2)c|x))c", "abc") is None
+    assert re.match(r"(?:(a)|(x))b(?=(?(2)x|c))c", "abc")
+    assert re.match(r"(?:(a)|(x))b(?=(?(1)b|x))c", "abc") is None
+    assert re.match(r"(?:(a)|(x))b(?=(?(1)c|x))c", "abc")
+    # Group used before defined.
+    assert re.match(r"(a)b(?=(?(2)x|c))(c)", "abc")
+    assert re.match(r"(a)b(?=(?(2)b|x))(c)", "abc") is None
+    assert re.match(r"(a)b(?=(?(1)c|x))(c)", "abc")
+
+
+def test_lookbehind():
+    """Positive (?<=...) and negative (?<!...) lookbehind, including
+    backreferences and conditional group references inside the assertion."""
+    assert re.match(r"ab(?<=b)c", "abc")
+    assert re.match(r"ab(?<=c)c", "abc") is None
+    assert re.match(r"ab(?<!b)c", "abc") is None
+    assert re.match(r"ab(?<!c)c", "abc")
+    # Group reference.
+    assert re.match(r"(a)a(?<=\1)c", "aac")
+    assert re.match(r"(a)b(?<=\1)a", "abaa") is None
+    assert re.match(r"(a)a(?<!\1)c", "aac") is None
+    assert re.match(r"(a)b(?<!\1)a", "abaa")
+    # Conditional group reference.
+    assert re.match(r"(?:(a)|(x))b(?<=(?(2)x|c))c", "abc") is None
+    assert re.match(r"(?:(a)|(x))b(?<=(?(2)b|x))c", "abc") is None
+    assert re.match(r"(?:(a)|(x))b(?<=(?(2)x|b))c", "abc")
+    assert re.match(r"(?:(a)|(x))b(?<=(?(1)c|x))c", "abc") is None
+    assert re.match(r"(?:(a)|(x))b(?<=(?(1)b|x))c", "abc")
+    # Group used before defined.
+    assert re.match(r"(a)b(?<=(?(1)c|x))(c)", "abc") is None
+    assert re.match(r"(a)b(?<=(?(1)b|x))(c)", "abc")
+
+
+def test_ignore_case():
+    """re.I case-insensitive matching for literals, classes, backreferences,
+    and Unicode case pairs with non-trivial folding (K/k, ſ/S, ᲀ/в/В, ﬅ/ﬆ)."""
+    assert re.match("abc", "ABC", re.I).group(0) == "ABC"
+    assert re.match(b"abc", b"ABC", re.I).group(0) == b"ABC"
+    assert re.match(r"(a\s[^a])", "a b", re.I).group(1) == "a b"
+    assert re.match(r"(a\s[^a]*)", "a bb", re.I).group(1) == "a bb"
+    assert re.match(r"(a\s[abc])", "a b", re.I).group(1) == "a b"
+    assert re.match(r"(a\s[abc]*)", "a bb", re.I).group(1) == "a bb"
+    assert re.match(r"((a)\s\2)", "a a", re.I).group(1) == "a a"
+    assert re.match(r"((a)\s\2*)", "a aa", re.I).group(1) == "a aa"
+    assert re.match(r"((a)\s(abc|a))", "a a", re.I).group(1) == "a a"
+    assert re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1) == "a aa"
+
+    # Two different characters have the same lowercase.
+    assert "K".lower() == "\u212a".lower() == "k"  # 'K'
+    assert re.match(r"K", "\u212a", re.I)
+    assert re.match(r"k", "\u212a", re.I)
+    # NOTE: \N{U+212a} is PCRE2 code-point syntax, not stdlib \N{NAME} syntax.
+    assert re.match(r"\N{U+212a}", "K", re.I)
+    assert re.match(r"\N{U+212a}", "k", re.I)
+
+    # Two different characters have the same uppercase.
+    assert "s".upper() == "\u017f".upper() == "S"  # 'ſ'
+    assert re.match(r"S", "\u017f", re.I)
+    assert re.match(r"s", "\u017f", re.I)
+    assert re.match(r"\u017f", "S", re.I)
+    assert re.match(r"\u017f", "s", re.I)
+
+    # Two different characters have the same uppercase. Unicode 9.0+.
+    assert "\u0432".upper() == "\u1c80".upper() == "\u0412"  # 'в', 'ᲀ', 'В'
+    assert re.match(r"\u0412", "\u0432", re.I)
+    assert re.match(r"\u0412", "\u1c80", re.I)
+    assert re.match(r"\u0432", "\u0412", re.I)
+    assert re.match(r"\u0432", "\u1c80", re.I)
+    assert re.match(r"\u1c80", "\u0412", re.I)
+    assert re.match(r"\u1c80", "\u0432", re.I)
+
+    # Two different characters have the same multicharacter uppercase.
+    assert "\ufb05".upper() == "\ufb06".upper() == "ST"  # 'ſt', 'st'
+    assert re.match(r"\ufb05", "\ufb06", re.I)
+    assert re.match(r"\ufb06", "\ufb05", re.I)
+
+
+def test_ignore_case_set():
+    """re.I folding applies to members of character classes, including Unicode
+    case pairs with non-trivial folding."""
+    assert re.match(r"[19A]", "A", re.I)
+    assert re.match(r"[19a]", "a", re.I)
+    assert re.match(r"[19a]", "A", re.I)
+    assert re.match(r"[19A]", "a", re.I)
+    assert re.match(rb"[19A]", b"A", re.I)
+    assert re.match(rb"[19a]", b"a", re.I)
+    assert re.match(rb"[19a]", b"A", re.I)
+    assert re.match(rb"[19A]", b"a", re.I)
+    assert re.match(r"[19\xc7]", "\xc7", re.I)
+    assert re.match(r"[19\xc7]", "\xe7", re.I)
+    assert re.match(r"[19\xe7]", "\xc7", re.I)
+    assert re.match(r"[19\xe7]", "\xe7", re.I)
+    assert re.match(r"[19\u0400]", "\u0400", re.I)
+    assert re.match(r"[19\u0400]", "\u0450", re.I)
+    assert re.match(r"[19\u0450]", "\u0400", re.I)
+    assert re.match(r"[19\u0450]", "\u0450", re.I)
+
+    assert re.match(rb"[19A]", b"A", re.I)
+    assert re.match(rb"[19a]", b"a", re.I)
+    assert re.match(rb"[19a]", b"A", re.I)
+    assert re.match(rb"[19A]", b"a", re.I)
+
+    # Two different characters have the same lowercase.
+    assert "K".lower() == "\u212a".lower() == "k"  # 'K'
+    assert re.match(r"[19K]", "\u212a", re.I)
+    assert re.match(r"[19k]", "\u212a", re.I)
+    assert re.match(r"[19\u212a]", "K", re.I)
+    assert re.match(r"[19\u212a]", "k", re.I)
+
+    # Two different characters have the same uppercase.
+    assert "s".upper() == "\u017f".upper() == "S"  # 'ſ'
+    assert re.match(r"[19S]", "\u017f", re.I)
+    assert re.match(r"[19s]", "\u017f", re.I)
+    assert re.match(r"[19\u017f]", "S", re.I)
+    assert re.match(r"[19\u017f]", "s", re.I)
+
+    # Two different characters have the same uppercase. Unicode 9.0+.
+    assert "\u0432".upper() == "\u1c80".upper() == "\u0412"  # 'в', 'ᲀ', 'В'
+    assert re.match(r"[19\u0412]", "\u0432", re.I)
+    assert re.match(r"[19\u0412]", "\u1c80", re.I)
+    assert re.match(r"[19\u0432]", "\u0412", re.I)
+    assert re.match(r"[19\u0432]", "\u1c80", re.I)
+    assert re.match(r"[19\u1c80]", "\u0412", re.I)
+    assert re.match(r"[19\u1c80]", "\u0432", re.I)
+
+    # Two different characters have the same multicharacter uppercase.
+    assert "\ufb05".upper() == "\ufb06".upper() == "ST"  # 'ſt', 'st'
+    assert re.match(r"[19\ufb05]", "\ufb06", re.I)
+    assert re.match(r"[19\ufb06]", "\ufb05", re.I)
+
+
+def test_ignore_case_range():
+    """re.I folding applies to character-class ranges; ASCII mode (re.A)
+    restricts folding across the ASCII boundary."""
+    # Issues #3511, #17381.
+    assert re.match(r"[9-a]", "_", re.I)
+    assert re.match(r"[9-A]", "_", re.I) is None
+    assert re.match(rb"[9-a]", b"_", re.I)
+    assert re.match(rb"[9-A]", b"_", re.I) is None
+    assert re.match(r"[\xc0-\xde]", "\xd7", re.I)
+    assert re.match(r"[\xc0-\xde]", "\xe7", re.I)
+    assert re.match(r"[\xc0-\xde]", "\xf7", re.I) is None
+    assert re.match(r"[\xe0-\xfe]", "\xf7", re.I)
+    assert re.match(r"[\xe0-\xfe]", "\xc7", re.I)
+    assert re.match(r"[\xe0-\xfe]", "\xd7", re.I) is None
+    assert re.match(r"[\u0430-\u045f]", "\u0450", re.I)
+    assert re.match(r"[\u0430-\u045f]", "\u0400", re.I)
+    assert re.match(r"[\u0400-\u042f]", "\u0450", re.I)
+    assert re.match(r"[\u0400-\u042f]", "\u0400", re.I)
+
+    assert re.match(r"[N-\x7f]", "A", re.I | re.A)
+    assert re.match(r"[n-\x7f]", "Z", re.I | re.A)
+    assert re.match(r"[N-\uffff]", "A", re.I | re.A)
+    assert re.match(r"[n-\uffff]", "Z", re.I | re.A)
+
+    # Two different characters have the same lowercase.
+    assert "K".lower() == "\u212a".lower() == "k"  # 'K'
+    assert re.match(r"[J-M]", "\u212a", re.I)
+    assert re.match(r"[j-m]", "\u212a", re.I)
+    assert re.match(r"[\u2129-\u212b]", "K", re.I)
+    assert re.match(r"[\u2129-\u212b]", "k", re.I)
+
+    # Two different characters have the same uppercase.
+    assert "s".upper() == "\u017f".upper() == "S"  # 'ſ'
+    assert re.match(r"[R-T]", "\u017f", re.I)
+    assert re.match(r"[r-t]", "\u017f", re.I)
+    assert re.match(r"[\u017e-\u0180]", "S", re.I)
+    assert re.match(r"[\u017e-\u0180]", "s", re.I)
+
+    # Two different characters have the same uppercase. Unicode 9.0+.
+    assert "\u0432".upper() == "\u1c80".upper() == "\u0412"  # 'в', 'ᲀ', 'В'
+    assert re.match(r"[\u0411-\u0413]", "\u0432", re.I)
+    assert re.match(r"[\u0411-\u0413]", "\u1c80", re.I)
+    assert re.match(r"[\u0431-\u0433]", "\u0412", re.I)
+    assert re.match(r"[\u0431-\u0433]", "\u1c80", re.I)
+    assert re.match(r"[\u1c80-\u1c82]", "\u0412", re.I)
+    assert re.match(r"[\u1c80-\u1c82]", "\u0432", re.I)
+
+    # Two different characters have the same multicharacter uppercase.
+    assert "\ufb05".upper() == "\ufb06".upper() == "ST"  # 'ſt', 'st'
+    assert re.match(r"[\ufb04-\ufb05]", "\ufb06", re.I)
+    assert re.match(r"[\ufb06-\ufb07]", "\ufb05", re.I)
+
+
+def test_category():
+    """The \\s category escape matches a space and can be captured."""
+    assert re.match(r"(\s)", " ").group(1) == " "
+
+
+def test_not_literal():
+    """Negated classes [^a] match any character other than the excluded one."""
+    assert re.search(r"\s([^a])", " b").group(1) == "b"
+    assert re.search(r"\s([^a]*)", " bb").group(1) == "bb"
+
+
+def test_possible_set_operations():
+    """Sequences resembling set operations (--, &&, ||, ~~, nested [..])
+    inside character classes are parsed as plain members/ranges, not operators."""
+    s = bytes(range(128)).decode()
+    assert re.findall(r"[0-9--1]", s) == list("-./0123456789")
+    assert re.findall(r"[0-9--2]", s) == list("-./0123456789")
+    assert re.findall(r"[--1]", s) == list("-./01")
+    assert re.findall(r"[%--1]", s) == list("%&'()*+,-1")
+    assert re.findall(r"[%--]", s) == list("%&'()*+,-")
+    assert re.findall(r"[0-9&&1]", s) == list("&0123456789")
+    assert re.findall(r"[0-8&&1]", s) == list("&012345678")
+    assert re.findall(r"[\d&&1]", s) == list("&0123456789")
+    assert re.findall(r"[&&1]", s) == list("&1")
+    assert re.findall(r"[0-9||a]", s) == list("0123456789a|")
+    assert re.findall(r"[\d||a]", s) == list("0123456789a|")
+    assert re.findall(r"[||1]", s) == list("1|")
+    assert re.findall(r"[0-9~~1]", s) == list("0123456789~")
+    assert re.findall(r"[\d~~1]", s) == list("0123456789~")
+    assert re.findall(r"[~~1]", s) == list("1~")
+    assert re.findall(r"[[0-9]|]", s) == list("0123456789[]")
+    assert re.findall(r"[[0-8]|]", s) == list("012345678[]")
+    assert re.findall(r"[[:digit:]|]", s) == list("0123456789|")
+
+
+def test_search_coverage():
+    """search() finds \\s-adjacent matches anywhere in the subject."""
+    assert re.search(r"\s(b)", " b").group(1) == "b"
+    assert re.search(r"a\s", "a ").group(0) == "a "
+
+
+def test_pickling():
+    """Compiled patterns round-trip through pickle at every protocol,
+    preserving the pattern string."""
+    import pickle
+
+    oldpat = re.compile("a(?:b|(c|e){1,2}?|d)+?(.)", re.UNICODE)
+    for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+        pickled = pickle.dumps(oldpat, proto)
+        newpat = pickle.loads(pickled)
+        assert newpat.pattern == oldpat.pattern
+    # current pickle expects the _compile() reconstructor in re module
+    from re import _compile  # noqa: F401
+
+
+def test_constants():
+    """Single-letter flag aliases equal their long-form counterparts."""
+    assert re.I == re.IGNORECASE
+    assert re.M == re.MULTILINE
+    assert re.S == re.DOTALL
+    assert re.X == re.VERBOSE
+
+
+def test_flags():
+    """Each supported flag is accepted by compile() for str and bytes patterns."""
+    for flag in [re.I, re.M, re.X, re.S, re.U]:  # TODO: Add re.A back
+        assert re.compile("^pattern$", flag)
+    for flag in [re.I, re.M, re.X, re.S]:  # TODO: Add re.A, re.L back
+        assert re.compile(b"^pattern$", flag)
+
+
+def test_character_set_errors():
+    """Unterminated classes, bad ranges, and class escapes used as range
+    endpoints are compile errors."""
+    check_pattern_error(r"[")
+    check_pattern_error(r"[^")
+    check_pattern_error(r"[a")
+    # bug 545855 -- This pattern failed to cause a compile error as it
+    # should, instead provoking a TypeError.
+    check_pattern_error(r"[a-")
+    check_pattern_error(r"[\w-b]")
+    check_pattern_error(r"[a-\w]")
+    check_pattern_error(r"[b-a]")
+
+
+def test_bug_113254():
+    """start/end/span of an unmatched group report -1 rather than raising."""
+    assert re.match(r"(a)|(b)", "b").start(1) == -1
+    assert re.match(r"(a)|(b)", "b").end(1) == -1
+    assert re.match(r"(a)|(b)", "b").span(1) == (-1, -1)
+
+
+def test_bug_527371():
+    """lastindex/lastgroup refer to the last outermost group that matched."""
+    # bug described in patches 527371/672491
+    assert re.match(r"(a)?a", "a").lastindex is None
+    assert re.match(r"(a)(b)?b", "ab").lastindex == 1
+    assert re.match(r"(?P<a>a)(?P<b>b)?b", "ab").lastgroup == "a"
+    assert re.match(r"(?P<a>a(b))", "ab").lastgroup == "a"
+    assert re.match(r"((a))", "a").lastindex == 1
+
+
+def test_bug_418626():
+    """Lazy '*?' over very long subjects completes without deep recursion.
+    Note the pcre2-specific jit=False keyword on the last search."""
+    # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
+    # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
+    # pattern '*?' on a long string.
+    assert re.match(".*?c", 10000 * "ab" + "cd").end(0) == 20001
+    assert re.match(".*?cd", 5000 * "ab" + "c" + 5000 * "ab" + "cde").end(0) == 20003
+    assert re.match(".*?cd", 20000 * "abc" + "de").end(0) == 60001
+    # non-simple '*?' still used to hit the recursion limit, before the
+    # non-recursive scheme was implemented.
+    assert re.search("(a|b)*?c", 10000 * "ab" + "cd", jit=False).end(0) == 20001
+
+
+def test_stack_overflow():
+    """Huge repeated-group matches (50k repetitions) do not overflow the stack."""
+    # nasty cases that used to overflow the straightforward recursive
+    # implementation of repeated groups.
+    assert re.match("(x)*", 50000 * "x").group(1) == "x"
+    assert re.match("(x)*y", 50000 * "x" + "y").group(1) == "x"
+    assert re.match("(x)*?y", 50000 * "x" + "y").group(1) == "x"
+
+
+def test_nothing_to_repeat():
+    """A quantifier with nothing before it is a compile error."""
+    for reps in "*", "+", "?", "{1,2}":
+        for mod in "", "?":
+            check_pattern_error("%s%s" % (reps, mod))
+            check_pattern_error("(?:%s%s)" % (reps, mod))
+
+
+def test_multiple_repeat():
+    """Stacking one quantifier directly on another (e.g. x**) is a compile
+    error, except combinations that form valid lazy/possessive modifiers."""
+    for outer_reps in "*", "+", "?", "{1,2}":
+        for outer_mod in "", "?", "+":
+            outer_op = outer_reps + outer_mod
+            for inner_reps in "*", "+", "?", "{1,2}":
+                for inner_mod in "", "?", "+":
+                    # Skip combos where inner_mod+outer_reps is just '?' or '+'
+                    # (a lazy/possessive modifier, which is legal).
+                    if inner_mod + outer_reps in ("?", "+"):
+                        continue
+                    inner_op = inner_reps + inner_mod
+                    check_pattern_error(r"x%s%s" % (inner_op, outer_op))
+
+
+def test_unlimited_zero_width_repeat():
+    """Unbounded repetition of a zero-width-capable group terminates (no
+    infinite loop) and correctly fails to match."""
+    # Issue #9669
+    assert re.match(r"(?:a?)*y", "z") is None
+    assert re.match(r"(?:a?)+y", "z") is None
+    assert re.match(r"(?:a?){2,}y", "z") is None
+    assert re.match(r"(?:a?)*?y", "z") is None
+    assert re.match(r"(?:a?)+?y", "z") is None
+    assert re.match(r"(?:a?){2,}?y", "z") is None
+
+
+def test_bug_448951():
+    """An optional group wrapping a quantified single-char match captures
+    correctly, or stays None when absent."""
+    # bug 448951 (similar to 429357, but with single char match)
+    # (Also test greedy matches.)
+    for op in "", "?", "*":
+        assert re.match(r"((.%s):)?z" % op, "z").groups() == (None, None)
+        assert re.match(r"((.%s):)?z" % op, "a:z").groups() == ("a:", "a")
+
+
+def test_bug_725106():
+    """Capturing groups inside alternatives inside repeats keep the text from
+    the repetition in which they last participated."""
+    # capturing groups in alternatives in repeats
+    assert re.match("^((a)|b)*", "abc").groups() == ("b", "a")
+    assert re.match("^(([ab])|c)*", "abc").groups() == ("c", "b")
+    assert re.match("^((d)|[ab])*", "abc").groups() == ("b", None)
+    assert re.match("^((a)c|[ab])*", "abc").groups() == ("b", None)
+    assert re.match("^((a)|b)*?c", "abc").groups() == ("b", "a")
+    assert re.match("^(([ab])|c)*?d", "abcd").groups() == ("c", "b")
+    assert re.match("^((d)|[ab])*?c", "abc").groups() == ("b", None)
+    assert re.match("^((a)c|[ab])*?c", "abc").groups() == ("b", None)
+
+
+def test_bug_725149():
+    """Groups captured only inside repeated lookarounds end up unset (None)."""
+    # mark_stack_base restoring before restoring marks
+    assert re.match("(a)(?:(?=(b)*)c)*", "abb").groups() == ("a", None)
+    assert re.match("(a)((?!(b)*))*", "abb").groups() == ("a", None, None)
+
+
+def test_bug_764548():
+    """compile() accepts str subclasses as patterns."""
+    # bug 764548, re.compile() barfs on str/unicode subclasses
+    class my_unicode(str):
+        pass
+
+    pat = re.compile(my_unicode("abc"))
+    assert pat.match("xyz") is None
+
+
+def test_finditer():
+    """finditer() yields non-overlapping matches in order; pos/endpos work
+    positionally, as keywords, and in either keyword order."""
+    iter = re.finditer(r":+", "a:b::c:::d")
+    assert [item.group(0) for item in iter] == [":", "::", ":::"]
+
+    pat = re.compile(r":+")
+    iter = pat.finditer("a:b::c:::d", 1, 10)
+    assert [item.group(0) for item in iter] == [":", "::", ":::"]
+
+    pat = re.compile(r":+")
+    iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
+    assert [item.group(0) for item in iter] == [":", "::", ":::"]
+
+    pat = re.compile(r":+")
+    iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
+    assert [item.group(0) for item in iter] == [":", "::", ":::"]
+
+    # A narrowed [pos, endpos) window truncates matches at its edges.
+    pat = re.compile(r":+")
+    iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
+    assert [item.group(0) for item in iter] == ["::", "::"]
+
+
+def test_bug_926075():
+    """str and bytes compilations of the same pattern are distinct objects."""
+    assert re.compile("bug_926075") is not re.compile(b"bug_926075")
+
+
+def test_bug_931848():
+    """split() works with a class of Unicode full-stop characters."""
+    pattern = "[\u002e\u3002\uff0e\uff61]"
+    assert re.compile(pattern).split("a.b.c") == ["a", "b", "c"]
+
+
+def test_bug_581080():
+    """The finditer iterator is exhausted after its single \\s match."""
+    iter = re.finditer(r"\s", "a b")
+    assert next(iter).span() == (1, 2)
+    assert_raises(StopIteration, next, iter)
+
+
+def test_bug_817234():
+    """.* yields the full match followed by one trailing empty match, then stops."""
+    iter = re.finditer(r".*", "asdf")
+    assert next(iter).span() == (0, 4)
+    assert next(iter).span() == (4, 4)
+    assert_raises(StopIteration, next, iter)
+
+
+def test_bug_6561():
+    """\\d matches only Unicode category Nd, not Nl or No."""
+    # '\d' should match characters in Unicode category 'Nd'
+    # (Number, Decimal Digit), but not those in 'Nl' (Number,
+    # Letter) or 'No' (Number, Other).
+    decimal_digits = [
+        "\u0037",  # '\N{DIGIT SEVEN}', category 'Nd'
+        "\u0e58",  # '\N{THAI DIGIT SIX}', category 'Nd'
+        "\uff10",  # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
+    ]
+    for x in decimal_digits:
+        assert re.match(r"^\d$", x).group(0) == x
+
+    not_decimal_digits = [
+        "\u2165",  # '\N{ROMAN NUMERAL SIX}', category 'Nl'
+        "\u3039",  # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
+        "\u2082",  # '\N{SUBSCRIPT TWO}', category 'No'
+        "\u32b4",  # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
+    ]
+    for x in not_decimal_digits:
+        assert re.match(r"^\d$", x) is None
+
+
+def test_inline_flags():
+    """Inline flags ((?i), (?s), combined and repeated) compose with flags
+    passed to compile(), including mid-pattern placement under (?x)."""
+    # Bug #1700
+    upper_char = "\u1ea0"  # Latin Capital Letter A with Dot Below
+    lower_char = "\u1ea1"  # Latin Small Letter A with Dot Below
+
+    p = re.compile("." + upper_char, re.I | re.S)
+    q = p.match("\n" + lower_char)
+    assert q
+
+    p = re.compile("." + lower_char, re.I | re.S)
+    q = p.match("\n" + upper_char)
+    assert q
+
+    p = re.compile("(?i)." + upper_char, re.S)
+    q = p.match("\n" + lower_char)
+    assert q
+
+    p = re.compile("(?i)." + lower_char, re.S)
+    q = p.match("\n" + upper_char)
+    assert q
+
+    p = re.compile("(?is)." + upper_char)
+    q = p.match("\n" + lower_char)
+    assert q
+
+    p = re.compile("(?is)." + lower_char)
+    q = p.match("\n" + upper_char)
+    assert q
+
+    p = re.compile("(?s)(?i)." + upper_char)
+    q = p.match("\n" + lower_char)
+    assert q
+
+    p = re.compile("(?s)(?i)." + lower_char)
+    q = p.match("\n" + upper_char)
+    assert q
+
+    # Inline flags after pattern start (legal here, unlike modern stdlib re).
+    assert re.match("(?ix) " + upper_char, lower_char)
+    assert re.match("(?ix) " + lower_char, upper_char)
+    assert re.match(" (?i) " + upper_char, lower_char, re.X)
+    assert re.match("(?x) (?i) " + upper_char, lower_char)
+    assert re.match(" (?x) (?i) " + upper_char, lower_char, re.X)
+
+
+def test_dollar_matches_twice():
+    r"""Test that $ does not include \n
+    $ matches the end of string, and just before the terminating \n"""
+    pattern = re.compile("$")
+    assert pattern.sub("#", "a\nb\n") == "a\nb#\n#"
+    assert pattern.sub("#", "a\nb\nc") == "a\nb\nc#"
+    assert pattern.sub("#", "\n") == "#\n#"
+
+    # Under MULTILINE, $ additionally matches before every \n.
+    pattern = re.compile("$", re.MULTILINE)
+    assert pattern.sub("#", "a\nb\n") == "a#\nb#\n#"
+    assert pattern.sub("#", "a\nb\nc") == "a#\nb#\nc#"
+    assert pattern.sub("#", "\n") == "#\n#"
+
+
+def test_bytes_str_mixing():
+    """Mixing str and bytes across pattern/subject/replacement raises TypeError."""
+    # Mixing str and bytes is disallowed
+    pat = re.compile(".")
+    bpat = re.compile(b".")
+    assert_raises(TypeError, pat.match, b"b")
+    assert_raises(TypeError, bpat.match, "b")
+    assert_raises(TypeError, pat.sub, b"b", "c")
+    assert_raises(TypeError, pat.sub, "b", b"c")
+    assert_raises(TypeError, pat.sub, b"b", b"c")
+    assert_raises(TypeError, bpat.sub, b"b", "c")
+    assert_raises(TypeError, bpat.sub, "b", b"c")
+    assert_raises(TypeError, bpat.sub, "b", "c")
+
+
+def test_ascii_and_unicode_flag():
+    """str patterns default to Unicode semantics (re.ASCII / (?a) opts out);
+    bytes patterns are ASCII-only; (?u) on bytes and (?au) are errors."""
+    # String patterns
+    for flags in (0, re.UNICODE):
+        pat = re.compile("\xc0", flags | re.IGNORECASE)
+        assert pat.match("\xe0")
+        pat = re.compile(r"\w", flags)
+        assert pat.match("\xe0")
+    pat = re.compile(r"\w", re.ASCII)
+    assert pat.match("\xe0") is None
+    pat = re.compile(r"(?a)\w")
+    assert pat.match("\xe0") is None
+    # Bytes patterns
+    for flags in (0, re.ASCII):
+        pat = re.compile(b"\xc0", flags | re.IGNORECASE)
+        assert pat.match(b"\xe0") is None
+        pat = re.compile(rb"\w", flags)
+        assert pat.match(b"\xe0") is None
+    # Incompatibilities
+    check_pattern_error(rb"(?u)\w")
+    assert_raises(re.PatternError, re.compile, r"(?u)\w", re.ASCII)
+    check_pattern_error(r"(?au)\w")
+
+
+def test_scoped_flags():
+    """Scoped inline flags (?i:...)/(?-i:...) apply only within their group;
+    malformed scoped-flag syntax is a compile error."""
+    assert re.match(r"(?i:a)b", "Ab")
+    assert re.match(r"(?i:a)b", "aB") is None
+    assert re.match(r"(?-i:a)b", "Ab", re.IGNORECASE) is None
+    assert re.match(r"(?-i:a)b", "aB", re.IGNORECASE)
+    assert re.match(r"(?i:(?-i:a)b)", "Ab") is None
+    assert re.match(r"(?i:(?-i:a)b)", "aB")
+    assert re.match(r"\w(?a:\W)\w", "\xe0\xe0\xe0")
+
+    check_pattern_error(rb"(?aL:a)")
+    check_pattern_error(r"(?-")
+    check_pattern_error(r"(?-+")
+    check_pattern_error(r"(?-z")
+    check_pattern_error(r"(?-i")
+    check_pattern_error(r"(?-i+")
+    check_pattern_error(r"(?-iz")
+    check_pattern_error(r"(?i:")
+    check_pattern_error(r"(?i")
+    check_pattern_error(r"(?i+")
+    check_pattern_error(r"(?iz")
+
+
def test_ignore_spaces():
    """VERBOSE mode skips unescaped whitespace inside the pattern."""
    whitespace = " \t\n\r\v\f"
    for ch in whitespace:
        assert re.fullmatch(ch + "a", "a", re.VERBOSE)
    for raw in (b" ", b"\t", b"\n", b"\r", b"\v", b"\f"):
        assert re.fullmatch(raw + b"a", b"a", re.VERBOSE)
    # (?x) enables verbose mode inline; (?x:...) / (?-x:...) scope it.
    assert re.fullmatch("(?x) a", "a")
    assert re.fullmatch(" (?x) a", "a", re.VERBOSE)
    assert re.fullmatch("(?x) (?x) a", "a")
    assert re.fullmatch(" a(?x: b) c", " ab c")
    assert re.fullmatch(" a(?-x: b) c", "a bc", re.VERBOSE)
    assert re.fullmatch("(?x) a(?-x: b) c", "a bc")
    # Verbose mode applies across alternation branches.
    assert re.fullmatch("(?x) a| b", "a")
    assert re.fullmatch("(?x) a| b", "b")
+
+
def test_comments():
    """VERBOSE mode treats '#' through end-of-line as a comment."""
    verbose_cases = [
        ("#x\na", "a"),
        ("#x\n(?x)#y\na", "a"),
        ("#x\na(?-x:#y\nb)#z\nc", "a#y\nbc"),
    ]
    for pattern, text in verbose_cases:
        assert re.fullmatch(pattern, text, re.VERBOSE)
    assert re.fullmatch(b"#x\na", b"a", re.VERBOSE)
    # (?x)/(?x:...)/(?-x:...) control comment stripping inline.
    inline_cases = [
        ("(?x)#x\na", "a"),
        ("(?x)#x\n(?x)#y\na", "a"),
        ("#x\na(?x:#y\nb)#z\nc", "#x\nab#z\nc"),
        ("(?x)#x\na(?-x:#y\nb)#z\nc", "a#y\nbc"),
        ("(?x)#x\na|#y\nb", "a"),
        ("(?x)#x\na|#y\nb", "b"),
    ]
    for pattern, text in inline_cases:
        assert re.fullmatch(pattern, text)
+
+
def test_bug_6509():
    """Replacement templates must parse for both str and bytes (issue #6509)."""
    # All-str: template replacements and callable replacements.
    str_cases = [
        (r"a(\w)", "b\\1", "ac", "bc"),
        ("a(.)", "b\\1", "a\u1234", "b\u1234"),
    ]
    for pattern, repl, subject, expected in str_cases:
        assert re.sub(pattern, repl, subject) == expected
    assert re.sub("..", lambda m: "str", "a5") == "str"

    # All-bytes equivalents.
    bytes_cases = [
        (rb"a(\w)", b"b\\1", b"ac", b"bc"),
        (b"a(.)", b"b\\1", b"a\xcd", b"b\xcd"),
    ]
    for pattern, repl, subject, expected in bytes_cases:
        assert re.sub(pattern, repl, subject) == expected
    assert re.sub(b"..", lambda m: b"bytes", b"a5") == b"bytes"
+
+
def test_search_dot_unicode():
    """'.' must traverse non-ASCII text of every code-point width."""
    for middle in ("abc", "\xe9", "\u20ac", "\U0010ffff",
                   "\xe9\u20ac\U0010ffff"):
        assert re.search("123.*-", "123" + middle + "-")
+
+
def test_compile():
    """re.compile returns Pattern objects and passes compiled patterns through."""
    compiled = re.compile("random pattern")
    assert isinstance(compiled, re.Pattern)
    # Compiling an already-compiled pattern returns the very same object.
    recompiled = re.compile(compiled)
    assert isinstance(recompiled, re.Pattern)
    assert recompiled is compiled
    # Anything that is neither str/bytes nor a Pattern is rejected.
    assert_raises(TypeError, re.compile, 0)
+
+
def test_large_search():
    """Match indices must not be 32-bit-truncated (issue #10182)."""
    size = 2  # * 1024 ** 2  # TODO: Works but is expensive for iterative tests
    haystack = "a" * size
    found = re.search("$", haystack)
    assert found is not None
    # "$" matches at the very end of the string.
    assert (found.start(), found.end()) == (size, size)
+
+
def test_large_subn():
    """subn match counts must not be 32-bit-truncated (issue #10182)."""
    size = 2  # * 1024 ** 2  # TODO: Works but is expensive for iterative tests
    haystack = "a" * size
    result, count = re.subn("", "", haystack)
    assert result == haystack
    # An empty pattern matches at every position, including both ends.
    assert count == size + 1
+
+
def test_bug_16688():
    """Backreferences work in case-insensitive non-ASCII text (issue 16688)."""
    matches = re.findall(r"(?i)(a)\1", "aa \u0100")
    assert matches == ["a"]
    span = re.match(r"(?s).{1,3}", "\u0100\u0100").span()
    assert span == (0, 2)
+
+
def test_repeat_minmax_overflow():
    """Repeat bounds at the 65535 boundary are honored exactly (issue #13169)."""
    subject = "x" * 100000
    for pattern in (r".{65535}", r".{,65535}", r".{65535,}?"):
        assert re.match(pattern, subject).span() == (0, 65535)
+
+
def test_look_behind_overflow():
    """Oversized look-behind widths are rejected at compile time.

    A nested repetition totalling 2**66 characters is too large for a
    look-behind width, so compilation must fail for both the positive
    (?<=...) and negative (?<!...) forms.
    """
    # Note: the original version also built an unused 2.5 MB string
    # ("x" * 2_500_000); only pattern *compilation* is exercised here,
    # so that dead allocation has been removed.
    for template in (r"(?<=((.{%d}){%d}){%d})", r"(?<!((.{%d}){%d}){%d})"):
        assert_raises(re.error, re.compile, template % (2**22, 2**22, 2**22))
+
+
def test_issue17998():
    """DOTALL repetitions with optional laziness still find full matches."""
    for reps in ("*", "+", "?", "{1}"):
        for mod in ("", "?"):
            pattern = "." + reps + mod + "yz"
            assert re.compile(pattern, re.S).findall("xyz") == ["xyz"]
            # Same check for the bytes form of the pattern.
            assert re.compile(pattern.encode(), re.S).findall(b"xyz") == [b"xyz"]
+
+
+def test_match_repr():
+    for string in "[abracadabra]", S("[abracadabra]"):
+        m = re.search(r"(.+)(.*?)\1", string)
+        pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % (
+            type(m).__module__,
+            type(m).__qualname__,
+        )
+        assert re.search(pattern, repr(m))
+    for string in (
+        b"[abracadabra]",
+        B(b"[abracadabra]"),
+        bytearray(b"[abracadabra]"),
+        memoryview(b"[abracadabra]"),
+    ):
+        m = re.search(rb"(.+)(.*?)\1", string)
+        pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % (
+            type(m).__module__,
+            type(m).__qualname__,
+        )
+        assert re.search(pattern, repr(m))
+
+    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
+    pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % (
+        type(second).__module__,
+        type(second).__qualname__,
+    )
+    assert re.search(pattern, repr(first))
+    pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % (
+        type(second).__module__,
+        type(second).__qualname__,
+    )
+    assert re.search(pattern, repr(second))
+
+
def test_zerowidth():
    """Zero-width matches in split/sub/findall/finditer.

    Covers issues 852532, 1647489, 3262, 25054.
    """
    subject = "a::bc"
    assert re.split(r"\b", subject) == ["", "a", "::", "bc", ""]
    assert re.split(r"\b|:+", subject) == ["", "a", "", "", "bc", ""]
    assert re.split(r"(?<!\w)(?=\w)|:+", subject) == ["", "a", "", "bc"]
    assert re.split(r"(?<=\w)(?!\w)|:+", subject) == ["a", "", "bc", ""]

    assert re.sub(r"\b", "-", subject) == "-a-::-bc-"
    assert re.sub(r"\b|:+", "-", subject) == "-a---bc-"
    assert re.sub(r"(\b|:+)", r"[\1]", subject) == "[]a[][::][]bc[]"

    assert re.findall(r"\b|:+", subject) == ["", "", "::", "", ""]
    assert re.findall(r"\b|\w+", subject) == ["", "a", "", "", "bc", ""]

    spans = [m.span() for m in re.finditer(r"\b|:+", subject)]
    assert spans == [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)]
    spans = [m.span() for m in re.finditer(r"\b|\w+", subject)]
    assert spans == [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)]
+
+
def test_bug_2537():
    """Empty submatches inside nested repeats (issue 2537)."""
    for outer in ("{0,}", "*", "+", "{1,187}"):
        for inner in ("{0,}", "*", "?"):
            compiled = re.compile("^((x|y)%s)%s" % (inner, outer))
            found = compiled.match("xyyzy")
            assert found.group(0) == "xyy"
            # The outer group's final iteration matched the empty string,
            # while the inner group keeps its last non-empty capture.
            assert found.group(1) == ""
            assert found.group(2) == "y"
+
+
def test_keyword_parameters():
    """Pattern methods accept string/pos/endpos/maxsplit keywords (issue #20283)."""
    compiled = re.compile(r"(ab)")
    assert compiled.match(string="abracadabra", pos=7, endpos=10).span() == (7, 9)
    assert compiled.fullmatch(string="abracadabra", pos=7, endpos=9).span() == (7, 9)
    assert compiled.search(string="abracadabra", pos=3, endpos=10).span() == (7, 9)
    assert compiled.findall(string="abracadabra", pos=3, endpos=10) == ["ab"]
    assert compiled.split(string="abracadabra", maxsplit=1) == ["", "ab", "racadabra"]
+
+
def test_bug_20998():
    """fullmatch of a repeated character class honors IGNORECASE (issue #20998)."""
    span = re.fullmatch("[a-c]+", "ABC", re.I).span()
    assert span == (0, 3)
+
+
def test_misc_errors():
    """Assorted malformed patterns must raise a compile error."""
    bad_patterns = (
        r"(",        # unclosed group
        r"((a|b)",   # unclosed outer group
        r"(a|b))",   # unbalanced closing paren
        r"(?P",      # incomplete named-group syntax
        r"(?z)",     # unknown flag
        r"(?iz)",    # known flag followed by unknown one
        r"(?i",      # unterminated flag group
        r"(?#abc",   # unterminated comment group
        r"(?<",      # incomplete look-behind
        r"(?<>)",    # empty group name
        r"(?",       # bare extension prefix
    )
    for pattern in bad_patterns:
        check_pattern_error(pattern)
+
+
def test_enum():
    # Issue #28082: str(flag) should yield a human-readable name such as
    # "IGNORECASE"/"DOTALL" rather than a bare integer. Disabled until the
    # module changes its flag representation to match.
    # TODO: Change representation of enums
    # self.assertIn("IGNORECASE", str(re.I))
    # self.assertIn("DOTALL", str(re.S))
    pass
+
+
+def test_bug_34294():
+    # Issue 34294: wrong capturing groups
+    # exists since Python 2
+    s = "a\tx"
+    p = r"\b(?=(\t)|(x))x"
+    assert re.search(p, s).groups() == (None, "x")
+
+    # introduced in Python 3.7.0
+    s = "ab"
+    p = r"(?=(.)(.)?)"
+    assert re.findall(p, s), [("a", "b") == ("b", "")]
+    assert [m.groups() for m in re.finditer(p, s)], [("a", "b") == ("b", None)]
+
+    # test-cases provided by issue34294, introduced in Python 3.7.0
+    p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
+    s = "<test><foo2/></test>"
+    assert re.findall(p, s), [("test", "<foo2/>") == ("foo2", "")]
+    assert [m.groupdict() for m in re.finditer(p, s)] == [
+        {"tag": "test", "text": "<foo2/>"},
+        {"tag": "foo2", "text": None},
+    ]
+    s = "<test>Hello</test><foo/>"
+    assert [m.groupdict() for m in re.finditer(p, s)] == [
+        {"tag": "test", "text": "Hello"},
+        {"tag": "foo", "text": None},
+    ]
+    s = "<test>Hello</test><foo/><foo/>"
+    assert [m.groupdict() for m in re.finditer(p, s)] == [
+        {"tag": "test", "text": "Hello"},
+        {"tag": "foo", "text": None},
+        {"tag": "foo", "text": None},
+    ]
+
+
+def test_MARK_PUSH_macro_bug():
+    # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
+    # was the only available mark.
+    assert re.match(r"(ab|a)*?b", "ab").groups() == ("a",)
+    assert re.match(r"(ab|a)+?b", "ab").groups() == ("a",)
+    assert re.match(r"(ab|a){0,2}?b", "ab").groups() == ("a",)
+    assert re.match(r"(.b|a)*?b", "ab").groups() == ("a",)
+
+
+def test_MIN_UNTIL_mark_bug():
+    # Fixed in issue35859, reported in issue9134.
+    # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
+    s = "axxzbcz"
+    p = r"(?:(?:a|bc)*?(xx)??z)*"
+    assert re.match(p, s).groups() == ("xx",)
+
+    # test-case provided by issue9134
+    s = "xtcxyzxc"
+    p = r"((x|yz)+?(t)??c)*"
+    m = re.match(p, s)
+    assert m.span() == (0, 8)
+    assert m.span(2) == (6, 7)
+    assert m.groups() == ("xyzxc", "x", "t")
+
+
+def test_REPEAT_ONE_mark_bug():
+    # issue35859
+    # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
+    s = "aabaab"
+    p = r"(?:[^b]*a(?=(b)|(a))ab)*"
+    m = re.match(p, s)
+    assert m.span() == (0, 6)
+    assert m.span(2) == (4, 5)
+    assert m.groups() == (None, "a")
+
+    # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
+    s = "abab"
+    p = r"(?:[^b]*(?=(b)|(a))ab)*"
+    m = re.match(p, s)
+    assert m.span() == (0, 4)
+    assert m.span(2) == (2, 3)
+    assert m.groups() == (None, "a")
+
+    assert re.match(r"(ab?)*?b", "ab").groups() == ("a",)
+
+
+def test_MIN_REPEAT_ONE_mark_bug():
+    # issue35859
+    # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
+    s = "abab"
+    p = r"(?:.*?(?=(a)|(b))b)*"
+    m = re.match(p, s)
+    assert m.span() == (0, 4)
+    assert m.span(2) == (3, 4)
+    assert m.groups() == (None, "b")
+
+    s = "axxzaz"
+    p = r"(?:a*?(xx)??z)*"
+    assert re.match(p, s).groups() == ("xx",)
+
+
+def test_ASSERT_NOT_mark_bug():
+    # Fixed in issue35859, reported in issue725149.
+    # JUMP_ASSERT_NOT should LASTMARK_SAVE()
+    assert re.match(r"(?!(..)c)", "ab").groups() == (None,)
+
+    # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
+    m = re.match(r"((?!(ab)c)(.))*", "abab")
+    assert m.span() == (0, 4)
+    assert m.span(1) == (3, 4)
+    assert m.span(3) == (3, 4)
+    assert m.groups() == ("b", None, "b")
+
+
def test_bug_40736():
    """Non-string subjects passed to search must raise TypeError (bpo-40736)."""
    for bad_subject in (5, type):
        with pytest.raises(TypeError):
            re.search("x*", bad_subject)
+
+
def test_search_anchor_at_beginning():
    """Anchored patterns that fail at position 0 must not rescan the
    rest of a huge subject string."""
    haystack = "x" * 10**7
    for pattern in (r"\Ay", r"^y"):
        assert re.search(pattern, haystack) is None
        assert re.split(pattern, haystack) == [haystack]
        assert re.findall(pattern, haystack) == []
        assert list(re.finditer(pattern, haystack)) == []
        assert re.sub(pattern, "", haystack) == haystack
+
+
+def test_possessive_quantifiers():
+    """Test Possessive Quantifiers
+    Test quantifiers of the form @+ for some repetition operator @,
+    e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed
+    without creating a stack frame for rolling the stack back and
+    trying 1 or more fewer matches."""
+    assert re.match("e*+e", "eeee") is None
+    assert re.match("e++a", "eeea").group(0) == "eeea"
+    assert re.match("e?+a", "ea").group(0) == "ea"
+    assert re.match("e{2,4}+a", "eeea").group(0) == "eeea"
+    assert re.match("(.)++.", "ee") is None
+    assert re.match("(ae)*+a", "aea").groups() == ("ae",)
+    assert re.match("([ae][ae])?+a", "aea").groups() == ("ae",)
+    assert re.match("(e?){2,4}+a", "eeea").groups() == ("",)
+    assert re.match("()*+a", "a").groups() == ("",)
+    assert re.search("x*+", "axx").span() == (0, 0)
+    assert re.search("x++", "axx").span() == (1, 3)
+    assert re.match("a*+", "xxx").span() == (0, 0)
+    assert re.match("x*+", "xxxa").span() == (0, 3)
+    assert re.match("a++", "xxx") is None
+    assert re.match(r"^(\w){1}+$", "abc") is None
+    assert re.match(r"^(\w){1,2}+$", "abc") is None
+
+    assert re.match(r"^(\w){3}+$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){1,3}+$", "abc").group(1) == "c"
+    assert re.match(r"^(\w){1,4}+$", "abc").group(1) == "c"
+
+    assert re.match("^x{1}+$", "xxx") is None
+    assert re.match("^x{1,2}+$", "xxx") is None
+
+    assert re.match("^x{3}+$", "xxx")
+    assert re.match("^x{1,3}+$", "xxx")
+    assert re.match("^x{1,4}+$", "xxx")
+
+    assert re.match("^x{}+$", "xxx") is None
+    assert re.match("^x{}+$", "x{}")
+
+
+def test_fullmatch_possessive_quantifiers():
+    assert re.fullmatch(r"a++", "a")
+    assert re.fullmatch(r"a*+", "a")
+    assert re.fullmatch(r"a?+", "a")
+    assert re.fullmatch(r"a{1,3}+", "a")
+    assert re.fullmatch(r"a++", "ab") is None
+    assert re.fullmatch(r"a*+", "ab") is None
+    assert re.fullmatch(r"a?+", "ab") is None
+    assert re.fullmatch(r"a{1,3}+", "ab") is None
+    assert re.fullmatch(r"a++b", "ab")
+    assert re.fullmatch(r"a*+b", "ab")
+    assert re.fullmatch(r"a?+b", "ab")
+    assert re.fullmatch(r"a{1,3}+b", "ab")
+
+    assert re.fullmatch(r"(?:ab)++", "ab")
+    assert re.fullmatch(r"(?:ab)*+", "ab")
+    assert re.fullmatch(r"(?:ab)?+", "ab")
+    assert re.fullmatch(r"(?:ab){1,3}+", "ab")
+    assert re.fullmatch(r"(?:ab)++", "abc") is None
+    assert re.fullmatch(r"(?:ab)*+", "abc") is None
+    assert re.fullmatch(r"(?:ab)?+", "abc") is None
+    assert re.fullmatch(r"(?:ab){1,3}+", "abc") is None
+    assert re.fullmatch(r"(?:ab)++c", "abc")
+    assert re.fullmatch(r"(?:ab)*+c", "abc")
+    assert re.fullmatch(r"(?:ab)?+c", "abc")
+    assert re.fullmatch(r"(?:ab){1,3}+c", "abc")
+
+
+def test_findall_possessive_quantifiers():
+    assert re.findall(r"a++", "aab") == ["aa"]
+    assert re.findall(r"a*+", "aab") == ["aa", "", ""]
+    assert re.findall(r"a?+", "aab") == ["a", "a", "", ""]
+    assert re.findall(r"a{1,3}+", "aab") == ["aa"]
+
+    assert re.findall(r"(?:ab)++", "ababc") == ["abab"]
+    assert re.findall(r"(?:ab)*+", "ababc") == ["abab", "", ""]
+    assert re.findall(r"(?:ab)?+", "ababc") == ["ab", "ab", "", ""]
+    assert re.findall(r"(?:ab){1,3}+", "ababc") == ["abab"]
+
+
+def test_atomic_grouping():
+    """Test Atomic Grouping
+    Test non-capturing groups of the form (?>...), which does
+    not maintain any stack point created within the group once the
+    group is finished being evaluated."""
+    pattern1 = re.compile(r"a(?>bc|b)c")
+    assert pattern1.match("abc") is None
+    assert pattern1.match("abcc")
+    assert re.match(r"(?>.*).", "abc") is None
+    assert re.match(r"(?>x)++", "xxx")
+    assert re.match(r"(?>x++)", "xxx")
+    assert re.match(r"(?>x)++x", "xxx") is None
+    assert re.match(r"(?>x++)x", "xxx") is None
+
+
+def test_fullmatch_atomic_grouping():
+    assert re.fullmatch(r"(?>a+)", "a")
+    assert re.fullmatch(r"(?>a*)", "a")
+    assert re.fullmatch(r"(?>a?)", "a")
+    assert re.fullmatch(r"(?>a{1,3})", "a")
+    assert re.fullmatch(r"(?>a+)", "ab") is None
+    assert re.fullmatch(r"(?>a*)", "ab") is None
+    assert re.fullmatch(r"(?>a?)", "ab") is None
+    assert re.fullmatch(r"(?>a{1,3})", "ab") is None
+    assert re.fullmatch(r"(?>a+)b", "ab")
+    assert re.fullmatch(r"(?>a*)b", "ab")
+    assert re.fullmatch(r"(?>a?)b", "ab")
+    assert re.fullmatch(r"(?>a{1,3})b", "ab")
+
+    assert re.fullmatch(r"(?>(?:ab)+)", "ab")
+    assert re.fullmatch(r"(?>(?:ab)*)", "ab")
+    assert re.fullmatch(r"(?>(?:ab)?)", "ab")
+    assert re.fullmatch(r"(?>(?:ab){1,3})", "ab")
+    assert re.fullmatch(r"(?>(?:ab)+)", "abc") is None
+    assert re.fullmatch(r"(?>(?:ab)*)", "abc") is None
+    assert re.fullmatch(r"(?>(?:ab)?)", "abc") is None
+    assert re.fullmatch(r"(?>(?:ab){1,3})", "abc") is None
+    assert re.fullmatch(r"(?>(?:ab)+)c", "abc")
+    assert re.fullmatch(r"(?>(?:ab)*)c", "abc")
+    assert re.fullmatch(r"(?>(?:ab)?)c", "abc")
+    assert re.fullmatch(r"(?>(?:ab){1,3})c", "abc")
+
+
+def test_findall_atomic_grouping():
+    assert re.findall(r"(?>a+)", "aab") == ["aa"]
+    assert re.findall(r"(?>a*)", "aab") == ["aa", "", ""]
+    assert re.findall(r"(?>a?)", "aab") == ["a", "a", "", ""]
+    assert re.findall(r"(?>a{1,3})", "aab") == ["aa"]
+
+    assert re.findall(r"(?>(?:ab)+)", "ababc") == ["abab"]
+    assert re.findall(r"(?>(?:ab)*)", "ababc") == ["abab", "", ""]
+    assert re.findall(r"(?>(?:ab)?)", "ababc") == ["ab", "ab", "", ""]
+    assert re.findall(r"(?>(?:ab){1,3})", "ababc") == ["abab"]
+
+
def test_bug_gh91616():
    """gh-91616: atomic group inside scoped (?s:...) flags must still match."""
    subject = "a.txt"
    assert re.fullmatch(r"(?s:(?>.*?\.).*)\z", subject)  # reproducer
    # Equivalent spelling via a look-ahead capture plus backreference.
    assert re.fullmatch(r"(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\z", subject)
+
+
+def test_bug_gh100061():
+    # gh-100061
+    assert re.match("(?>(?:.(?!D))+)", "ABCDE").span() == (0, 2)
+    assert re.match("(?:.(?!D))++", "ABCDE").span() == (0, 2)
+    assert re.match("(?>(?:.(?!D))*)", "ABCDE").span() == (0, 2)
+    assert re.match("(?:.(?!D))*+", "ABCDE").span() == (0, 2)
+    assert re.match("(?>(?:.(?!D))?)", "CDE").span() == (0, 0)
+    assert re.match("(?:.(?!D))?+", "CDE").span() == (0, 0)
+    assert re.match("(?>(?:.(?!D)){1,3})", "ABCDE").span() == (0, 2)
+    assert re.match("(?:.(?!D)){1,3}+", "ABCDE").span() == (0, 2)
+    # gh-106052
+    assert re.match("(?>(?:ab?c)+)", "aca").span() == (0, 2)
+    assert re.match("(?:ab?c)++", "aca").span() == (0, 2)
+    assert re.match("(?>(?:ab?c)*)", "aca").span() == (0, 2)
+    assert re.match("(?:ab?c)*+", "aca").span() == (0, 2)
+    assert re.match("(?>(?:ab?c)?)", "a").span() == (0, 0)
+    assert re.match("(?:ab?c)?+", "a").span() == (0, 0)
+    assert re.match("(?>(?:ab?c){1,3})", "aca").span() == (0, 2)
+    assert re.match("(?:ab?c){1,3}+", "aca").span() == (0, 2)
+
+
+def test_bug_gh101955():
+    # Possessive quantifier with nested alternative with capture groups
+    assert re.match("((x)|y|z)*+", "xyz").groups() == ("z", "x")
+    assert re.match("((x)|y|z){3}+", "xyz").groups() == ("z", "x")
+    assert re.match("((x)|y|z){3,}+", "xyz").groups() == ("z", "x")
+
+
def test_regression_gh94675():
    """gh-94675: this pathological pattern must terminate; run the sub()
    in a subprocess so a hang can be detected and the worker killed."""
    # TODO: Multiprocessing requires pickling
    pattern = re.compile(
        r"(?<=[({}])(((//[^\n]*)?[\n])([\000-\040])*)*"
        r"((/[^/\[\n]*(([^\n]|(\[\n]*(]*)*\]))"
        r"[^/\[]*)*/))((((//[^\n]*)?[\n])"
        r"([\000-\040]|(/\*[^*]*\*+"
        r"([^/*]\*+)*/))*)+(?=[^\000-\040);\]}]))"
    )
    input_js = """a(function() {
        ///////////////////////////////////////////////////////////////////
    });"""
    worker = multiprocessing.Process(target=pattern.sub, args=("", input_js))
    worker.start()
    worker.join(30.0)
    try:
        # Still alive after 30 seconds means the sub() call hung.
        assert not worker.is_alive(), "pattern.sub() timed out"
    finally:
        if worker.is_alive():
            worker.terminate()
            worker.join()
+
+
def test_fail():
    """(?!) never matches, so only the second alternative can succeed."""
    found = re.search(r"12(?!)|3", "123")
    assert found[0] == "3"
+
+
def test_character_set_any():
    """A union of complementary classes matches any character — it is
    equivalent to "(?s:.)"."""
    subject = "1x\n"
    for pattern in (r"[\s\S]", r"[\d\D]", r"[\w\W]", r"[\S\s]", r"\s|\S"):
        assert re.findall(pattern, subject) == list(subject)
        assert re.fullmatch("(?:" + pattern + ")+", subject).group() == subject
+
+
def test_character_set_none():
    """The negation of a union of complementary classes matches nothing."""
    subject = "1x\n"
    for pattern in (r"[^\s\S]", r"[^\d\D]", r"[^\w\W]", r"[^\S\s]"):
        assert re.search(pattern, subject) is None
        assert re.search("(?s:.)" + pattern, subject) is None